diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml index f86ee27d885..16ef2cdbe0e 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml @@ -82,6 +82,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -131,7 +132,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 32 LSCB: 32 @@ -240,7 +241,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -318,6 +319,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -325,19 +327,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x192x32_MI32N23IjcAtXY3gS9sbPso0pabe5MBq0S1TE7vcRd-meK0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32x9RnxenTyJ8nNAkQ5MP4CciTM97xuXar2sESVoWHLOSA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false @@ -367,45 +369,45 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA6_NTB5_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 2 + LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 24576 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -415,15 +417,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] + MIWaveGroup: [1, 2] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -443,22 +445,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 5 + NonTemporalA: 5 + NonTemporalB: 1 NonTemporalC: 3 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 4 - NumLoadsB: 6 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -476,13 +478,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA6_NTB5_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -491,17 +493,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 2 SubGroup1: 64 - SubGroupA: 4 + SubGroupA: 2 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 3 + ThreadTile1: 1 ThreadTileA: 32 - ThreadTileB: 3 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -526,22 +528,22 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -554,6 +556,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -561,20 +564,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32x9RnxenTyJ8nNAkQ5MP4CciTM97xuXar2sESVoWHLOSA= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xFyuxAoGvph_2vmowDsHYGcRvobf86RQ55Pjx9h9U33c= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -591,7 +594,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -603,47 +606,47 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -651,15 +654,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -673,28 +676,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 + NonTemporalA: 7 + NonTemporalB: 0 NonTemporalC: 3 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -712,31 +715,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA5_NTB1_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 2 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 32 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -755,16 +758,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -790,6 +793,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -797,20 +801,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI32xyRDk7sX5vsMSJppic6TLF_4te7mEav7TatLixdQJT30= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xoHfE-v637ME_GaV2g0jV2lBKdWqGLuzENP4MZ6HR9hI= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -839,34 +843,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB7_NTC2_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -874,10 +878,10 @@ LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] @@ -888,14 +892,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 2] - MIWaveTile: [2, 2] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -909,27 +913,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 2 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -948,479 +952,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB7_NTC2_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM4_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 8 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: true - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xFyuxAoGvph_2vmowDsHYGcRvobf86RQ55Pjx9h9U33c= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: true - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 - LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 4 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB0_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 16 - StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false - UseF32XEmulation: true - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: 0 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: false - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xoHfE-v637ME_GaV2g0jV2lBKdWqGLuzENP4MZ6HR9hI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: true - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 - LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 0 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 2 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB6_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -1498,6 +1030,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1547,7 +1080,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB0_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB0_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 LSCA: 64 LSCB: 64 @@ -1655,8 +1188,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB0_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB0_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -1734,6 +1267,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1783,7 +1317,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB4_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB4_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 128 LSCB: 256 @@ -1891,8 +1425,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB4_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC16_WGMXCCGn1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB4_NTC2_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -1970,6 +1504,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2019,7 +1554,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA5_NTB2_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA5_NTB2_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 256 LSCB: 128 @@ -2127,8 +1662,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA5_NTB2_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA5_NTB2_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -2206,6 +1741,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2255,7 +1791,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 256 LSCB: 64 @@ -2363,8 +1899,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC2_WGMXCCGn1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -2442,6 +1978,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2449,7 +1986,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32N382wOe_VdwYzQeWhtrhRLLcvxRf6wbNhOQ8fX8rqeY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x192x32_MI32oHDfc1w9ZJ59VVCd63Cw6zcGNYOdVz-kbsA3B5E6qt0= BufferLoad: true BufferStore: true CUCount: null @@ -2460,9 +1997,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -2473,16 +2010,16 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -2491,47 +2028,47 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB0_NTC2_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 64 + LSPA: 1 + LSPB: 4 + LVCA: 256 + LVCB: 64 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 126976 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -2540,14 +2077,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2561,28 +2098,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 1 NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalC: 2 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 32 + NumLoadsB: 24 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -2599,9 +2136,9 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB0_NTC2_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 @@ -2609,7 +2146,7 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 @@ -2622,16 +2159,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -2643,15 +2180,15 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 32 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -2668,8 +2205,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -2678,6 +2215,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2685,20 +2223,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI32cuaU17NT-6RkoTgdi0RTVYFwHV-7hZQoppoCFEA5jZ0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x64_MI32CBCon1f_jjG7kFWvrki0k9KCkvWV_6aitgnZVLy2uLQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true + DepthU: 64 + DirectToLds: 0 DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -2727,34 +2265,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC1_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA6_NTB3_NTC1_NTD3_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 32 + LSCA: 64 LSCB: 128 - LSPA: 32 + LSPA: 16 LSPB: 8 - LVCA: 8 + LVCA: 16 LVCB: 32 - LVPA: 8 + LVPA: 4 LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 102400 + LdsBytesNoAmax: 163840 LdsInitCVgprs: false - LdsNumBytes: 102400 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 163840 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 81920 + LdsOffsetB: 49152 + LdsOffsetB_Blk: 131072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 131072 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -2763,9 +2301,9 @@ LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -2775,14 +2313,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 1] - MIWaveTileA: 5 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 160 + MacroTile0: 192 MacroTile1: 128 - MacroTileA: 160 + MacroTileA: 192 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -2803,22 +2341,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 + NonTemporalA: 6 + NonTemporalB: 3 NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 - NumLoadsB: 4 - NumLoadsCoalescedA: 5 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 12 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -2835,33 +2373,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC1_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA6_NTB3_NTC1_NTD3_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 1 - ThreadTileA: 80 - ThreadTileB: 1 + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -2880,28 +2418,28 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 4 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -2910,10 +2448,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2921,31 +2460,31 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x32_MI32BPvwrwOPWLMSGu9IDc3jxT3nNImpX5CzCmoRL1pKfX0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x128x32_MI16xTk4t1GYEO8dXLFvvy6qawZiyVf4SLCuxBjKBQNdkFJo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true + DirectToLds: 0 DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -2953,7 +2492,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -2963,35 +2502,35 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA6_NTB2_NTC0_NTD1_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 LSCB: 128 LSPA: 16 LSPB: 8 LVCA: 16 LVCB: 32 - LVPA: 4 + LVPA: 16 LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 18944 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 24576 + LdsNumBytes: 18944 + LdsNumElementsAlignedA: 2560 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 35328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 90112 - LdsPadA: 0 + LdsOffsetMetadata: 18944 + LdsOffsetMetadata_Blk: 35328 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -2999,36 +2538,36 @@ LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 2] + MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 192 + MacroTile0: 16 MacroTile1: 128 - MacroTileA: 192 + MacroTileA: 16 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -3039,19 +2578,19 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 + NonTemporalA: 1 + NonTemporalB: 7 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 @@ -3062,8 +2601,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3071,18 +2610,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA6_NTB2_NTC0_NTD1_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -3094,9 +2633,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 48 + ThreadTileA: 4 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -3122,9 +2661,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3140,8 +2679,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -3150,6 +2689,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3157,7 +2697,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x192x32_MI32oHDfc1w9ZJ59VVCd63Cw6zcGNYOdVz-kbsA3B5E6qt0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32x0N1POJCZet_LVsGUYqIrwKXt1eieh3pBXSGbIuD3g3g= BufferLoad: true BufferStore: true CUCount: null @@ -3167,10 +2707,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -3181,16 +2721,16 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -3199,47 +2739,47 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB0_NTC2_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 256 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB0_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 LSCB: 64 - LSPA: 1 - LSPB: 4 - LVCA: 256 - LVCB: 64 - LVPA: 1 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -3247,15 +2787,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3269,27 +2809,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 3 NonTemporalB: 0 NonTemporalC: 2 - NonTemporalD: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 32 - NumLoadsB: 24 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 32 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -3307,39 +2847,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB0_NTC2_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC4_WGMXCCGn1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB0_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -3351,23 +2891,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 4 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -3376,8 +2916,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -3386,6 +2926,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3393,12 +2934,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32EbHBmJf1RwwjS-JHuju7-HwNAkADz1-rDSkQbaangMI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI32ZMmu-bzOKlbYjIX7s6koUkT8HNmefLN2P0SO5Z-Nnko= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -3417,16 +2958,16 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -3435,34 +2976,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 LSCA: 256 - LSCB: 128 - LSPA: 1 - LSPB: 2 - LVCA: 256 - LVCB: 128 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 1 - LVPB: 2 + LVPB: 1 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 116736 + LdsNumBytes: 135168 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 67584 LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata_Blk: 101376 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 @@ -3484,14 +3025,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 128 + MacroTile1: 256 MacroTileA: 256 - MacroTileB: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3511,22 +3052,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 - NumLoadsB: 16 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3543,22 +3084,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB0_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -3567,9 +3108,9 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 64 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 64 - ThreadTileB: 2 + ThreadTileB: 4 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -3588,7 +3129,7 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -3596,7 +3137,7 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3609,11 +3150,11 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -3622,6 +3163,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3629,17 +3171,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x64_MI32CBCon1f_jjG7kFWvrki0k9KCkvWV_6aitgnZVLy2uLQ= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI3284eUjDzim0Vb_W96BcV5gyLtY2Bs0qmDakwEoN2Byco= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -3659,7 +3201,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -3671,36 +3213,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA6_NTB3_NTC1_NTD3_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB0_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 163840 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 163840 - LdsNumElementsAlignedA: 49152 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 81920 - LdsOffsetB: 49152 - LdsOffsetB_Blk: 131072 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 49152 - LdsOffsetMetadata_Blk: 131072 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -3708,8 +3250,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -3720,14 +3262,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3747,21 +3289,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 3 - NonTemporalC: 1 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 4 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 12 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 NumLoadsB: 8 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -3779,22 +3321,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA6_NTB3_NTC1_NTD3_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB0_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -3802,16 +3344,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 2 - ThreadTileA: 48 - ThreadTileB: 2 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -3823,8 +3365,8 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -3832,14 +3374,14 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -3848,16 +3390,17 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3865,31 +3408,31 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x128x32_MI16xTk4t1GYEO8dXLFvvy6qawZiyVf4SLCuxBjKBQNdkFJo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x32_MI32WBJlp7tLm-Q3nBoJMo5z21xK-EwQTfjK_G03HnBD9dU= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -3897,7 +3440,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -3907,35 +3450,35 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 + LSCA: 64 LSCB: 128 LSPA: 16 LSPB: 8 LVCA: 16 LVCB: 32 - LVPA: 16 + LVPA: 4 LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 18944 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 18944 - LdsNumElementsAlignedA: 2560 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 24576 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 35328 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18944 - LdsOffsetMetadata_Blk: 35328 - LdsPadA: 16 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -3943,36 +3486,36 @@ LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 192 MacroTile1: 128 - MacroTileA: 16 + MacroTileA: 192 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -3983,19 +3526,19 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 + NonTemporalA: 0 + NonTemporalB: 3 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 @@ -4006,8 +3549,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4015,8 +3558,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -4030,7 +3573,7 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -4038,9 +3581,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 48 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 48 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -4066,8 +3609,8 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -4090,10 +3633,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4101,7 +3645,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x128x32_MI16x0_c2bicLSoaL5ucbtdGYwt9FDH0ijAaEQ7zoll3TbDU= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32bDs7ERukWCJnKPCbYUdj9elgilVVVE9QLJjd6V3PYSM= BufferLoad: true BufferStore: true CUCount: null @@ -4121,20 +3665,20 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -4143,36 +3687,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA4_NTB7_NTC0_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 256 LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 + LSPA: 1 + LSPB: 2 + LVCA: 256 + LVCB: 128 + LVPA: 1 LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 18944 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 18944 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 35328 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18944 - LdsOffsetMetadata_Blk: 35328 - LdsPadA: 16 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -4180,35 +3724,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 256 MacroTile1: 128 - MacroTileA: 16 + MacroTileA: 256 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -4220,21 +3764,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 4 - NonTemporalB: 7 - NonTemporalC: 0 - NonTemporalD: 6 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 32 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4242,8 +3786,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4251,18 +3795,18 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA4_NTB7_NTC0_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 + StoreSyncOpt: 0 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -4274,16 +3818,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 64 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 64 ThreadTileB: 2 - TransposeLDS: 0 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -4295,16 +3839,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -4320,16 +3864,17 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4337,7 +3882,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x30fo2zv2H_H7usihB88nJCRmxw5zj2fAW_9XjCZCib_U= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI323WKz5cIoQJzJkl1Y9sYbIyhvMjcuf8cv5vryis9qKpc= BufferLoad: true BufferStore: true CUCount: null @@ -4367,7 +3912,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -4379,36 +3924,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB5_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -4419,7 +3964,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -4428,14 +3973,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4455,22 +4000,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4487,22 +4032,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB5_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -4510,16 +4055,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -4531,8 +4076,8 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -4540,7 +4085,7 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -4562,10 +4107,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4573,12 +4119,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT384x96x32_MI32xMp1LLxd-AbguO2fIODet7-mR4XQwMGN5Q3-AdL8vDuY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x160x32_MI16xYXiQ01SsEx7M8YMRarVLDIWv1Jq14florxH1dF7qE1g= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -4597,7 +4143,7 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -4605,7 +4151,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -4615,36 +4161,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA1_NTB4_NTC5_NTD2_NTM0_NEPBS0_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB7_NTC0_NTD6_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 16 LSCB: 32 LSPA: 8 - LSPB: 32 - LVCA: 32 + LSPB: 16 + LVCA: 16 LVCB: 8 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 2560 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 + LdsBytesNoAmax: 27648 LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 49152 - LdsNumElementsAlignedB: 12288 + LdsNumBytes: 27648 + LdsNumElementsAlignedA: 6656 + LdsNumElementsAlignedB: 20992 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 49152 - LdsOffsetB_Blk: 114688 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 6656 + LdsOffsetB_Blk: 39424 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 49152 - LdsOffsetMetadata_Blk: 114688 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 39424 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -4652,35 +4198,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [3, 3] + MIWaveGroup: [1, 2] + MIWaveTile: [3, 5] MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 384 - MacroTile1: 96 - MacroTileA: 384 - MacroTileB: 96 + MacroTile0: 48 + MacroTile1: 160 + MacroTileA: 48 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -4691,23 +4237,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 4 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 5 + NonTemporalB: 7 + NonTemporalC: 0 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 144 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 NumLoadsA: 12 - NumLoadsB: 3 + NumLoadsB: 10 NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedB: 5 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -4715,7 +4261,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4723,33 +4269,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA1_NTB4_NTC5_NTD2_NTM0_NEPBS0_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM24_WGMXCC16_WGMXCCGn1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB7_NTC0_NTD6_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 3 - ThreadTileA: 48 - ThreadTileB: 3 + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -4774,9 +4320,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -4789,7 +4335,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -4798,10 +4344,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4809,20 +4356,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xOycBr-nJqkLN04wSTXD2H--ROwgtIJdefEMIAR1Kids= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x160x32_MI16xpjXnNPQ1cCsbAbLFpk8IuW-bxkhtk3OqekdPjWzUVgM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -4833,15 +4380,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -4851,99 +4398,99 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 32 + LSPA: 8 LSPB: 16 LVCA: 16 - LVCB: 16 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 2560 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 27648 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 27648 + LdsNumElementsAlignedA: 6656 + LdsNumElementsAlignedB: 20992 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 6656 + LdsOffsetB_Blk: 39424 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 39424 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 2] + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 48 + MacroTile1: 160 + MacroTileA: 48 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalA: 5 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 12 + NumLoadsB: 10 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -4951,7 +4498,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4959,33 +4506,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM16_WGMXCC16_WGMXCCGn1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM48_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5003,23 +4550,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -5028,8 +4575,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -5038,6 +4585,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5045,7 +4593,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x48x128_MI16xJftuo0E_G-xvDYhaHanhHEFiHU6M0rjT4KfIwGmaRvk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x64x32_MI16xdzK9JUaqXTNoF2Ac01jkgAeGIlt9zYm7jAEtGmpTIwY= BufferLoad: true BufferStore: true CUCount: null @@ -5055,7 +4603,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -5075,7 +4623,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -5087,39 +4635,39 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2560_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 32 - LSCB: 16 + LSCB: 64 LSPA: 32 - LSPB: 64 + LSPB: 16 LVCA: 8 - LVCB: 4 + LVCB: 16 LVPA: 8 - LVPB: 16 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 768 + LVPB: 4 + LdsBlockSizePerPadA: 2560 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 108544 + LdsBytesNoAmax: 61952 LdsInitCVgprs: false - LdsNumBytes: 108544 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 26624 + LdsNumBytes: 61952 + LdsNumElementsAlignedA: 20992 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 20992 + LdsOffsetB_Blk: 53760 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 - LdsPadB: 16 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 53760 + LdsPadA: 16 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -5127,7 +4675,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -5135,15 +4683,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 48 - MacroTileA: 32 - MacroTileB: 48 + MacroTile0: 160 + MacroTile1: 64 + MacroTileA: 160 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5163,21 +4711,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 2 + NonTemporalA: 7 + NonTemporalB: 7 + NonTemporalC: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 6 - NumGlobalWriteVectorsPerThread: 3 - NumLoadsA: 4 - NumLoadsB: 6 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 2 + NumLoadsCoalescedA: 5 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 @@ -5187,7 +4735,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -5195,33 +4743,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2560_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5239,33 +4787,33 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -5274,6 +4822,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5281,7 +4830,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32x0N1POJCZet_LVsGUYqIrwKXt1eieh3pBXSGbIuD3g3g= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1P79SndavTGvem3QTcxe5avntrlbrnZffKcV66eEHhVU= BufferLoad: true BufferStore: true CUCount: null @@ -5291,7 +4840,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -5306,15 +4855,15 @@ ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -5323,48 +4872,48 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB0_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB5_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: 0 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 8 + LVCA: 4 LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -5372,23 +4921,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -5399,23 +4948,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 3 + NonTemporalA: 2 + NonTemporalB: 5 + NonTemporalC: 3 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -5423,7 +4972,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -5431,33 +4980,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB0_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB5_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5475,23 +5024,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [16, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -5500,8 +5049,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -5510,6 +5059,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5517,7 +5067,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI32ZMmu-bzOKlbYjIX7s6koUkT8HNmefLN2P0SO5Z-Nnko= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1GRq0J0BIf9_kOmRivWvl-M_wy7XxwpHY5TtqQ7BcIXE= BufferLoad: true BufferStore: true CUCount: null @@ -5527,10 +5077,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -5541,16 +5091,16 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -5559,99 +5109,99 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC2_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: 1 + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalB: 5 + NonTemporalC: 2 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -5659,7 +5209,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -5667,39 +5217,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB3_NTC6_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC2_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -5711,23 +5261,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -5736,8 +5286,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -5746,6 +5296,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5753,20 +5304,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x96x64_MI32x3i3wgIkgjfqD9uXc_vVgFeJgfaL8Y6k8yzx4nmpWsnuI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1d8vD8HZkReziZDPDfy_jTNMHwhgwUmOg1B_nL57bLcY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -5777,16 +5328,16 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -5795,48 +5346,48 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA5_NTB0_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 24576 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 - LdsOffsetBias: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 - LoopUnroll: 16 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -5844,50 +5395,50 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 3] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 96 - MacroTileA: 32 - MacroTileB: 96 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalA: 2 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 6 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -5903,13 +5454,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA5_NTB0_NTC3_NTD2_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -5918,18 +5469,18 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5954,9 +5505,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [16, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -5982,6 +5533,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5989,20 +5541,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x80x64_MI16x1Gc6DcRCckwHfmBpQBlKs8kImvAws_A5ImLAW0mdPiFY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1d0IEB8OKcqBBBF-4yx_CbFvCcHAv99Kn-9PNgMVg4Do= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6013,16 +5565,16 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -6031,47 +5583,47 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1280_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB5_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 1 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB7_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: 0 + LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 64 + LSPA: 8 + LSPB: 8 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1280 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 103424 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 103424 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 21504 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 LdsPadA: 0 - LdsPadB: 16 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -6079,15 +5631,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 5] - MIWaveTileA: 2 - MIWaveTileB: 5 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 80 - MacroTileA: 64 - MacroTileB: 80 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -6101,29 +5653,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 20 - NumGlobalWriteVectorsPerThread: 10 - NumLoadsA: 4 - NumLoadsB: 5 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 5 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -6131,7 +5683,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6139,33 +5691,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1280_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_5_MO40_NTn1_NTA7_NTB0_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB5_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB7_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 5 - ThreadTileA: 8 - ThreadTileB: 5 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -6183,16 +5735,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [16, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -6208,8 +5760,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -6218,6 +5770,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6225,7 +5778,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI3284eUjDzim0Vb_W96BcV5gyLtY2Bs0qmDakwEoN2Byco= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1wJ0_010zQ1ozDEKij-mIG9KmyLbncyS2dBVtU7Gx8vc= BufferLoad: true BufferStore: true CUCount: null @@ -6235,10 +5788,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6249,16 +5802,16 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -6267,99 +5820,99 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB0_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC3_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: 1 + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 3 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -6367,7 +5920,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6375,39 +5928,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 26 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB0_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC16_WGMXCCGn1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC3_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -6419,23 +5972,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -6444,8 +5997,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -6454,6 +6007,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6461,7 +6015,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x32_MI32WBJlp7tLm-Q3nBoJMo5z21xK-EwQTfjK_G03HnBD9dU= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x15ubexBXumzCQ5oYriKs0bC1QIR5F5xgSZw0beSsLJOA= BufferLoad: true BufferStore: true CUCount: null @@ -6471,9 +6025,9 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true - DirectToLdsA: false + DirectToLdsA: true DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false @@ -6486,7 +6040,7 @@ ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -6494,7 +6048,7 @@ GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -6503,99 +6057,99 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 LDSTrInst: 0 - LSCA: 64 - LSCB: 128 - LSPA: 16 + LSCA: 16 + LSCB: 16 + LSPA: 32 LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 90112 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 + NonTemporalA: 2 + NonTemporalB: 6 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 8 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -6603,7 +6157,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6611,8 +6165,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 27 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB3_NTC0_NTD0_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC1_WGMXCCGn1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -6626,18 +6180,18 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 2 - ThreadTileA: 48 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -6656,22 +6210,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -6690,6 +6244,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6697,20 +6252,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32bDs7ERukWCJnKPCbYUdj9elgilVVVE9QLJjd6V3PYSM= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1TchkKUPeyaTDsftqLBdXFb97CfgH3KVHDT_kuosD6JY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6727,7 +6282,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6739,99 +6294,99 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC2_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 LDSTrInst: 1 - LSCA: 256 - LSCB: 128 - LSPA: 1 - LSPB: 2 - LVCA: 256 - LVCB: 128 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 512 + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 3 NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalC: 2 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 - NumLoadsB: 16 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -6839,7 +6394,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6847,39 +6402,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 28 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA4_NTB1_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC2_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -6891,23 +6446,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -6926,6 +6481,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6933,7 +6489,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI323WKz5cIoQJzJkl1Y9sYbIyhvMjcuf8cv5vryis9qKpc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x1SPQBGNO544lf62ybBPTUgV2soicNbRRY22eNTeioc20= BufferLoad: true BufferStore: true CUCount: null @@ -6943,10 +6499,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6957,15 +6513,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -6975,99 +6531,99 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: 1 + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 28672 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 28672 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalA: 3 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -7075,7 +6631,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -7083,39 +6639,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 29 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC0_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -7127,23 +6683,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -7152,16 +6708,17 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7169,20 +6726,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x160x32_MI16xYXiQ01SsEx7M8YMRarVLDIWv1Jq14florxH1dF7qE1g= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x17FGnDg9mjs4H7rgGicbDCmHKT_mE2FoSY2Lgq3YFt7E= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -7211,7 +6768,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB7_NTC0_NTD6_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2 LDSTrInst: 0 LSCA: 16 LSCB: 32 @@ -7221,33 +6778,33 @@ LVCB: 8 LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 768 - LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 27648 + LdsBytesNoAmax: 28672 LdsInitCVgprs: false - LdsNumBytes: 27648 - LdsNumElementsAlignedA: 6656 - LdsNumElementsAlignedB: 20992 + LdsNumBytes: 28672 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 6656 - LdsOffsetB_Blk: 39424 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 39424 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -7259,15 +6816,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [3, 5] - MIWaveTileA: 3 - MIWaveTileB: 5 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 160 - MacroTileA: 48 - MacroTileB: 160 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7281,28 +6838,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 7 - NonTemporalC: 0 - NonTemporalD: 6 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 60 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 12 - NumLoadsB: 10 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 5 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7319,13 +6876,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 30 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB7_NTC0_NTD6_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -7336,16 +6893,16 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 5 - ThreadTileA: 12 - ThreadTileB: 5 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7364,28 +6921,28 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -7394,10 +6951,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7405,17 +6963,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x160x32_MI16xpjXnNPQ1cCsbAbLFpk8IuW-bxkhtk3OqekdPjWzUVgM= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xDLKsQQPRrn0n5Vwsb2N_XxY3gRwpEsX5MH_upKXcePc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -7428,91 +6986,92 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 16 - LSCB: 32 - LSPA: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 16 + LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 768 - LdsBlockSizePerPadB: 2560 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 27648 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 27648 - LdsNumElementsAlignedA: 6656 - LdsNumElementsAlignedB: 20992 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 6656 - LdsOffsetB_Blk: 39424 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 99840 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 39424 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 99840 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [3, 5] - MIWaveTileA: 3 - MIWaveTileB: 5 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 160 - MacroTileA: 48 - MacroTileB: 160 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -7524,22 +7083,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 5 - NonTemporalB: 6 + NonTemporalB: 7 NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 60 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 12 - NumLoadsB: 10 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 5 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -7547,7 +7106,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -7555,8 +7114,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 31 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS0_NLCA3_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM48_WGMXCC1_WGMXCCGn1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -7564,60 +7123,61 @@ StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 + StreamKXCCMapping: 8 + SubGroup0: 2 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 5 - ThreadTileA: 12 - ThreadTileB: 5 - TransposeLDS: 0 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -7626,14 +7186,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7641,7 +7203,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x64x32_MI16xdzK9JUaqXTNoF2Ac01jkgAeGIlt9zYm7jAEtGmpTIwY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xBlD6lvzcO3AsZHEkbsuV-5HK3LQYPUSYdge36Zz-Tkk= BufferLoad: true BufferStore: true CUCount: null @@ -7651,7 +7213,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -7664,6 +7226,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -7671,7 +7234,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -7680,75 +7243,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2560_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB7_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 - LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 2560 - LdsBlockSizePerPadB: 1024 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61952 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 61952 - LdsNumElementsAlignedA: 20992 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 20992 - LdsOffsetB_Blk: 53760 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20992 - LdsOffsetMetadata_Blk: 53760 - LdsPadA: 16 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 2] - MIWaveTileA: 5 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 64 - MacroTileA: 160 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -7759,22 +7322,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 4 NonTemporalB: 7 - NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 5 - NumLoadsB: 2 - NumLoadsCoalescedA: 5 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7791,33 +7354,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 32 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2560_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB7_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 2 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 2 - ThreadTileA: 20 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7827,49 +7390,52 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7877,37 +7443,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT224x64x32_MI16xwgOGIlbHSY-6NN6XLdmF7L-EWnBxY0nUWipOv7yd_Fk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x64x32_MI32x6vVSC6SDE2KOmLmTsSdREVzn1S-j_c1qld7cPM9ruFo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -7916,101 +7483,101 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA6_NTB4_NTC0_NTD5_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 0 - LSCA: 32 + LSCA: 256 LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 1 LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 45056 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 45056 - LdsNumElementsAlignedA: 35840 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 35840 - LdsOffsetB_Blk: 101376 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45056 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [7, 2] - MIWaveTileA: 7 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 224 + MacroTile0: 256 MacroTile1: 64 - MacroTileA: 224 + MacroTileA: 256 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 4 + NonTemporalA: 0 + NonTemporalB: 1 NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 56 - NumGlobalWriteVectorsPerThread: 56 - NumLoadsA: 7 - NumLoadsB: 4 - NumLoadsCoalescedA: 7 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8018,8 +7585,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8027,8 +7594,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 33 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA6_NTB4_NTC0_NTD5_NTM0_NEPBS0_NLCA7_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -8037,8 +7604,8 @@ StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -8050,20 +7617,21 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 28 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 28 + ThreadTileA: 32 ThreadTileB: 2 - TransposeLDS: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -8071,16 +7639,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -8098,14 +7666,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8113,20 +7683,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1P79SndavTGvem3QTcxe5avntrlbrnZffKcV66eEHhVU= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32m8hH9VbP1rsnhmm7x1C7xvs1_vLIeT1cHnec5pXHId8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -8136,118 +7706,119 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB5_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 8 - LVCA: 4 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 256 + LSCB: 128 + LSPA: 1 + LSPB: 2 + LVCA: 256 + LVCB: 128 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 3 - NonTemporalD: 1 + NonTemporalA: 3 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 32 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -8255,7 +7826,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8263,85 +7834,88 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 34 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB5_NTC3_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8349,7 +7923,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1GRq0J0BIf9_kOmRivWvl-M_wy7XxwpHY5TtqQ7BcIXE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32P7qsJIhrv78XpyeSC7zBiVxn1KY_eJp5nKJBC0jo2gQ= BufferLoad: true BufferStore: true CUCount: null @@ -8359,7 +7933,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -8372,91 +7946,92 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC2_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 16 - LSCB: 16 + LSCA: 128 + LSCB: 128 LSPA: 8 LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -8467,23 +8042,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 2 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -8491,7 +8066,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8499,33 +8074,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 35 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC2_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -8535,33 +8110,34 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -8570,14 +8146,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8585,7 +8163,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1d8vD8HZkReziZDPDfy_jTNMHwhgwUmOg1B_nL57bLcY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32dcnL-xT6QilVKSAVmeQtnX8jMTNwUHqUlw-b5wTn7Vg= BufferLoad: true BufferStore: true CUCount: null @@ -8595,10 +8173,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -8608,118 +8186,119 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB1_NTC0_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 256 + LSPA: 4 + LSPB: 1 + LVCA: 64 + LVCB: 256 + LVPA: 2 + LVPB: 1 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 - NonTemporalC: 1 + NonTemporalA: 4 + NonTemporalB: 1 + NonTemporalC: 0 NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularB: 32 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -8727,7 +8306,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8735,69 +8314,70 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 36 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB7_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB1_NTC0_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -8806,14 +8386,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8821,7 +8403,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1d0IEB8OKcqBBBF-4yx_CbFvCcHAv99Kn-9PNgMVg4Do= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI32lCuOSl64I7neVp6btzXlQuLUgdhwEhhlghPKHMS9l-A= BufferLoad: true BufferStore: true CUCount: null @@ -8831,10 +8413,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -8844,118 +8426,119 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB7_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 + NonTemporalA: 0 + NonTemporalB: 3 NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -8963,7 +8546,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8971,69 +8554,70 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 37 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB7_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -9042,14 +8626,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9057,7 +8643,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1wJ0_010zQ1ozDEKij-mIG9KmyLbncyS2dBVtU7Gx8vc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x256x32_MI327T9q44lPTbgQhD9P5Re1DTYhmb-kIDLWiLzzNZahKdk= BufferLoad: true BufferStore: true CUCount: null @@ -9067,10 +8653,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -9080,6 +8666,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 1 @@ -9096,102 +8683,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC3_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: 1 - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA5_NTB1_NTC5_NTD0_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 256 + LSPA: 4 + LSPB: 1 + LVCA: 64 + LVCB: 256 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 126976 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 3 - NonTemporalD: 1 + NonTemporalA: 5 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLoadsA: 24 + NumLoadsB: 32 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularB: 32 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9199,7 +8786,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9207,13 +8794,13 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 38 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC3_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA5_NTB1_NTC5_NTD0_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -9224,26 +8811,27 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -9252,22 +8840,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -9276,16 +8864,258 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x256x32_MI32Osu4QqVfcn5r6pIbEerf6Jrxm2K689E53kraS9QV9zc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 256 + LSPA: 8 + LSPB: 2 + LVCA: 32 + LVCB: 128 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 60928 + LdsInitCVgprs: false + LdsNumBytes: 60928 + LdsNumElementsAlignedA: 26112 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 26112 + LdsOffsetB_Blk: 91648 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 60928 + LdsOffsetMetadata_Blk: 91648 + LdsPadA: 4 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 2] + MIWaveTileA: 6 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 5 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 1 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 12 + NumLoadsB: 16 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 96 + ThreadTile1: 2 + ThreadTileA: 96 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9293,7 +9123,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x15ubexBXumzCQ5oYriKs0bC1QIR5F5xgSZw0beSsLJOA= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32bI_bdkvPgr0hxWR52yzRVG-SV6EeZroAMXfU2LBV6Cc= BufferLoad: true BufferStore: true CUCount: null @@ -9303,10 +9133,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -9316,9 +9146,250 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthB: 2 + GlobalSplitU: 0 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 2 + LVCA: 32 + LVCB: 128 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117248 + LdsInitCVgprs: false + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 4 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 7 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 3 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 0] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x256x32_MI32hi5QaXYYbTLfVo4WYFrf6BFXaTAmKT1q2UzDXxqfsTc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -9326,108 +9397,108 @@ GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB0_NTC7_NTD3_NTM0_NEPBS14_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 8 - LVCA: 4 - LVCB: 16 + LSCA: 32 + LSCB: 256 + LSPA: 16 + LSPB: 2 + LVCA: 16 + LVCB: 128 LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 + LVPB: 1 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 123392 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 123392 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 88576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 88576 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 6 - NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalA: 5 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLoadsA: 10 + NumLoadsB: 16 + NumLoadsCoalescedA: 5 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularB: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9435,7 +9506,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9444,13 +9515,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 39 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC0_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB0_NTC7_NTD3_NTM0_NEPBS14_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -9459,27 +9530,28 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 80 + ThreadTile1: 2 + ThreadTileA: 80 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -9488,40 +9560,42 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9529,7 +9603,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1TchkKUPeyaTDsftqLBdXFb97CfgH3KVHDT_kuosD6JY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32rvSVHgGyc9O8bCX7KPSzyfhaYox1wvz-y6e2df6qooI= BufferLoad: true BufferStore: true CUCount: null @@ -9539,10 +9613,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -9552,118 +9626,119 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC2_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: 1 - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB1_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 128 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 1 + LVPB: 2 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 117248 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 6 NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 0 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 16 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9671,7 +9746,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9680,84 +9755,87 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 40 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC2_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB1_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9765,7 +9843,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x1SPQBGNO544lf62ybBPTUgV2soicNbRRY22eNTeioc20= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xZDgg0HrpoVaRxARF92LojQtAHhrsjP4H0KyqEhH-sCk= BufferLoad: true BufferStore: true CUCount: null @@ -9775,7 +9853,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -9788,6 +9866,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 @@ -9804,51 +9883,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: 1 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 32 LSCB: 32 LSPA: 8 - LSPB: 16 - LVCA: 16 + LSPB: 32 + LVCA: 32 LVCB: 8 LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28672 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 28672 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -9856,23 +9935,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 2] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -9883,23 +9962,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 5 + NonTemporalB: 4 + NonTemporalC: 3 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 + NumElementsPerBatchStore: 10 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9907,7 +9986,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9916,7 +9995,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 41 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -9930,18 +10009,18 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 4 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9951,7 +10030,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -9960,22 +10040,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -9984,16 +10064,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10001,7 +10083,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x17FGnDg9mjs4H7rgGicbDCmHKT_mE2FoSY2Lgq3YFt7E= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xT_oWldoREPLcWaxzrswuGqxafTJ0Dx8ymCrTRmXFnI8= BufferLoad: true BufferStore: true CUCount: null @@ -10011,7 +10093,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -10024,6 +10106,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 @@ -10040,51 +10123,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: 0 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 32 LSCB: 32 LSPA: 8 - LSPB: 16 - LVCA: 16 + LSPB: 32 + LVCA: 32 LVCB: 8 LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28672 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 28672 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -10092,23 +10175,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 2] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -10119,8 +10202,8 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 + NonTemporalA: 7 + NonTemporalB: 5 NonTemporalC: 1 NonTemporalD: 3 NonTemporalE: 0 @@ -10129,13 +10212,13 @@ NumElementsPerBatchStore: 8 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -10143,7 +10226,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10152,12 +10235,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 42 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -10166,18 +10249,18 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10187,49 +10270,52 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10237,20 +10323,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16x0UDQxayBLn5JqBICPOgS8LV3qdpA-ceEcBdTCX5K32w= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3TwUtfedIksh229b4jL_RGCbgCNCFvRURgDStOE3W9l4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -10262,15 +10348,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -10280,48 +10366,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB7_NTC5_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 20480 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 20480 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 43008 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 43008 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 - LoopUnroll: 32 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -10329,48 +10415,48 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 6 - NonTemporalC: 1 + NonTemporalA: 7 + NonTemporalB: 7 + NonTemporalC: 5 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 @@ -10389,31 +10475,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 43 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB7_NTC5_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM2_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 32 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 32 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -10430,28 +10516,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -10463,12 +10549,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10476,7 +10563,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xS8BemcgDmhBdB4lISWaJd3al9jSpienby5xNB2TIiUs= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xEv6q-2RrREhV2X2V16Q3urIUlLubt1R_nl9HzjW-1Vo= BufferLoad: true BufferStore: true CUCount: null @@ -10487,9 +10574,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -10507,7 +10594,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -10519,34 +10606,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -10554,12 +10641,12 @@ LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -10568,13 +10655,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -10589,27 +10676,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalA: 7 + NonTemporalB: 5 + NonTemporalC: 2 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 4 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -10628,21 +10715,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 44 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -10650,9 +10737,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -10669,10 +10756,10 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -10681,7 +10768,7 @@ WavefrontSize: 64 WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -10690,7 +10777,7 @@ _DepthUB: 128 _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -10702,12 +10789,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10715,7 +10803,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x128_MI16xAuxWbMqQMhgDMEqwX2zYcF0BxPd1rraVdJXHayyzx80= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16x3NlLSXz4MTW4u3ogcgbK2QAMEnVRZJPbsZXS-xRtf7U= BufferLoad: true BufferStore: true CUCount: null @@ -10725,10 +10813,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -10746,7 +10834,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -10758,45 +10846,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC5_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC3_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 16 LSCB: 16 - LSPA: 32 + LSPA: 64 LSPB: 64 - LVCA: 8 + LVCA: 4 LVCB: 4 - LVPA: 8 + LVPA: 16 LVPB: 16 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -10807,13 +10895,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -10828,28 +10916,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 5 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 4 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10858,7 +10946,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10867,21 +10955,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 45 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC5_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC3_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 1024 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 4 SubGroup1: 16 SubGroupA: 4 @@ -10889,9 +10977,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -10911,7 +10999,7 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -10920,14 +11008,14 @@ WavefrontSize: 64 WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -10936,17 +11024,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10954,7 +11043,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32MrNnY6-q5-x--_2qLKFIhdkT-VfmsesFKPD_9pFBJKE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3v3FiLk7zthz6BecsfG_ytiGrAKMvyDv3YQNVHChxGm8= BufferLoad: true BufferStore: true CUCount: null @@ -10964,7 +11053,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -10980,15 +11069,15 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -10997,47 +11086,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB7_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB6_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 + LSCA: 32 + LSCB: 32 + LSPA: 32 LSPB: 8 - LVCA: 32 + LVCA: 8 LVCB: 32 - LVPA: 2 - LVPB: 2 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -11045,15 +11134,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -11073,22 +11162,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 4 + NonTemporalA: 6 + NonTemporalB: 6 + NonTemporalC: 6 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11097,7 +11186,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11106,32 +11195,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 46 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB7_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB6_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11147,28 +11236,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -11186,6 +11275,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11193,7 +11283,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xDLKsQQPRrn0n5Vwsb2N_XxY3gRwpEsX5MH_upKXcePc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3ztAWOuJIg4hj-4ujEuGL1U8M4eXirObR_poPSQp0OOM= BufferLoad: true BufferStore: true CUCount: null @@ -11203,10 +11293,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -11218,16 +11308,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -11236,45 +11326,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 133120 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 133120 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66560 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 99840 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 99840 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -11285,14 +11375,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -11306,22 +11396,22 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 7 - NonTemporalC: 1 + NonTemporalA: 1 + NonTemporalB: 5 + NonTemporalC: 3 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 @@ -11336,7 +11426,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11345,7 +11435,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 47 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -11353,13 +11443,13 @@ StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -11367,16 +11457,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -11386,11 +11476,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -11398,33 +11488,34 @@ WavefrontSize: 64 WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11432,20 +11523,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xBlD6lvzcO3AsZHEkbsuV-5HK3LQYPUSYdge36Zz-Tkk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3kOSFPyfEiXFMwNZMGIhbzH01Yc6BxtbPYDwewDUScr8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -11475,8 +11566,8 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB7_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 LSCA: 64 LSCB: 32 LSPA: 16 @@ -11488,21 +11579,21 @@ LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -11510,10 +11601,10 @@ LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] @@ -11545,28 +11636,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 7 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 6 + NonTemporalB: 2 + NonTemporalC: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 + NumElementsPerBatchStore: 14 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11575,7 +11666,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11584,21 +11675,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 48 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB7_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -11625,7 +11716,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -11637,14 +11728,14 @@ WavefrontSize: 64 WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -11653,17 +11744,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11671,7 +11763,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32x5GkUaAhALWi9UJBFESYvXtYl-6lpndJ9tLANP9gypm0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xw4RgIcqBRFqEmLFOO3XP4aJt222Ys1eV6bag4UczOws= BufferLoad: true BufferStore: true CUCount: null @@ -11681,7 +11773,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 256 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -11714,72 +11806,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -11790,22 +11882,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 3 NonTemporalB: 0 - NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11823,12 +11915,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 49 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC2_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 1024 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -11837,17 +11929,17 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 8 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -11874,16 +11966,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 32 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -11903,6 +11995,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11910,7 +12003,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x64x32_MI32x6vVSC6SDE2KOmLmTsSdREVzn1S-j_c1qld7cPM9ruFo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3syvkIi8LYYEv3DFYiM1Iz5gfUUiF9htX1js2hwjk18E= BufferLoad: true BufferStore: true CUCount: null @@ -11920,7 +12013,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -11936,15 +12029,15 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -11953,45 +12046,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB7_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 256 - LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 1 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 32768 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] @@ -12001,15 +12094,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 64 - MacroTileA: 256 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12029,22 +12122,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalA: 5 + NonTemporalB: 7 + NonTemporalC: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12053,7 +12146,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12062,32 +12155,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 50 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB1_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB7_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 + StreamKXCCMapping: 8 + SubGroup0: 2 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -12106,23 +12199,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -12142,6 +12235,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12149,20 +12243,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32m8hH9VbP1rsnhmm7x1C7xvs1_vLIeT1cHnec5pXHId8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3fJnt75929EZ_aqQCP_MCPyiNiz8LyKpNQWmHPNmxA6E= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -12180,7 +12274,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -12192,47 +12286,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 - LSCA: 256 - LSCB: 128 - LSPA: 1 - LSPB: 2 - LVCA: 256 - LVCB: 128 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -12240,15 +12334,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12262,28 +12356,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 - NumLoadsB: 16 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12292,7 +12386,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12301,38 +12395,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 51 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB3_NTC0_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -12342,32 +12436,32 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -12375,12 +12469,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12388,7 +12483,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xngs_elNKDW6m1ocFurAVW1-bILkxN-GRvv7ATNlFbNA= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3kO-rrc7SP8abCiazbzjpjv970HJ6dMdiUNvfFInMaUg= BufferLoad: true BufferStore: true CUCount: null @@ -12398,7 +12493,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -12408,7 +12503,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -12419,7 +12514,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -12431,45 +12526,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB7_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 - LVPB: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] @@ -12479,15 +12574,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12507,22 +12602,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalA: 5 + NonTemporalB: 7 + NonTemporalC: 6 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12530,8 +12625,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12540,31 +12635,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 52 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB7_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM16_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -12584,29 +12679,29 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -12620,6 +12715,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12627,7 +12723,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32P7qsJIhrv78XpyeSC7zBiVxn1KY_eJp5nKJBC0jo2gQ= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16x7Ue9K6K2Ntf9cWH31tBpMfG-dUbDAAZYidjEukyBkJ8= BufferLoad: true BufferStore: true CUCount: null @@ -12637,10 +12733,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -12658,7 +12754,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -12670,92 +12766,92 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 3 NonTemporalB: 2 NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -12779,32 +12875,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 53 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 1024 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 4 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -12820,26 +12916,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -12848,17 +12944,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12866,7 +12963,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32dcnL-xT6QilVKSAVmeQtnX8jMTNwUHqUlw-b5wTn7Vg= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3-Y1s2goE5bt3ELzlBLdYEEGXo4CN58GBD1CcuhssmnI= BufferLoad: true BufferStore: true CUCount: null @@ -12876,10 +12973,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -12891,8 +12988,8 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -12900,7 +12997,7 @@ GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -12909,45 +13006,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB1_NTC0_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 128 - LSCB: 256 - LSPA: 4 - LSPB: 1 - LVCA: 64 - LVCB: 256 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -12957,15 +13054,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 4] + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12979,28 +13076,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalA: 7 + NonTemporalB: 5 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 32 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 32 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13009,7 +13106,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13018,38 +13115,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 54 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA4_NTB1_NTC0_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 4 + ThreadTile1: 1 ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13063,22 +13160,22 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -13087,17 +13184,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13105,7 +13203,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI32lCuOSl64I7neVp6btzXlQuLUgdhwEhhlghPKHMS9l-A= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI32x3jGU7eBWYn2uq06g9WNdz2rJrlsiV08zOFEkl_DheAO8= BufferLoad: true BufferStore: true CUCount: null @@ -13116,9 +13214,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -13130,16 +13228,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -13148,47 +13246,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 256 - LSCB: 256 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 LSPA: 4 LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13196,15 +13294,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13218,29 +13316,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -13248,7 +13346,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13257,38 +13355,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 55 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StoreVectorWidth: 1 + StreamK: 3 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 4 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13298,19 +13396,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 2, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -13319,24 +13417,25 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13344,7 +13443,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x256x32_MI327T9q44lPTbgQhD9P5Re1DTYhmb-kIDLWiLzzNZahKdk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xNwlw2eRLckxkCN-CUwNSdJhWdS3ncTyQtpMiviPnWD0= BufferLoad: true BufferStore: true CUCount: null @@ -13354,7 +13453,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 256 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -13369,16 +13468,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -13387,72 +13486,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA5_NTB1_NTC5_NTD0_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 64 - LSCB: 256 - LSPA: 4 - LSPB: 1 - LVCA: 64 - LVCB: 256 - LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 4] - MIWaveTileA: 3 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -13463,22 +13562,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 0 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 3 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 192 - NumLoadsA: 24 - NumLoadsB: 32 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 32 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13496,12 +13595,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 56 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA5_NTB1_NTC5_NTD0_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 1024 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -13510,24 +13609,24 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13541,22 +13640,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -13572,10 +13671,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13583,7 +13683,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x256x32_MI32Osu4QqVfcn5r6pIbEerf6Jrxm2K689E53kraS9QV9zc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x128_MI16xMeVY6M4rG08B1L31ecGGxHsYBMoyD7ClC_9Q_YvLxrE= BufferLoad: true BufferStore: true CUCount: null @@ -13593,10 +13693,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -13608,8 +13708,8 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -13626,98 +13726,98 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 64 - LSCB: 256 - LSPA: 8 - LSPB: 2 - LVCA: 32 - LVCB: 128 - LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60928 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 60928 - LdsNumElementsAlignedA: 26112 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 26112 - LdsOffsetB_Blk: 91648 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 60928 - LdsOffsetMetadata_Blk: 91648 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [6, 2] - MIWaveTileA: 6 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 6 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 4 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 12 - NumLoadsB: 16 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13726,7 +13826,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13735,12 +13835,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 57 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_2_MO40_NTn1_NTA5_NTB2_NTC1_NTD1_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -13749,24 +13849,24 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 5 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 96 - ThreadTile1: 2 - ThreadTileA: 96 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13776,28 +13876,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -13809,12 +13909,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13822,7 +13923,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32bI_bdkvPgr0hxWR52yzRVG-SV6EeZroAMXfU2LBV6Cc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x3QNX14jqPW75Zz3vlZTY-T_2lUZsf-R8ZcuULerVLK3c= BufferLoad: true BufferStore: true CUCount: null @@ -13832,10 +13933,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -13847,15 +13948,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -13865,47 +13966,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC7_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 - LSCA: 128 - LSCB: 256 + LSCA: 32 + LSCB: 64 LSPA: 8 - LSPB: 2 + LSPB: 16 LVCA: 32 - LVCB: 128 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13913,15 +14014,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13935,28 +14036,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 0 - NonTemporalC: 6 + NonTemporalA: 6 + NonTemporalB: 5 + NonTemporalC: 7 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 2 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13974,38 +14075,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 58 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA7_NTB0_NTC6_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC7_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 128 + SubGroup1: 64 SubGroupA: 2 - SubGroupB: 128 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -14015,26 +14116,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -14048,12 +14149,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14061,7 +14163,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x256x32_MI32hi5QaXYYbTLfVo4WYFrf6BFXaTAmKT1q2UzDXxqfsTc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x32AltWgSZdqdwx7GzSAOhOKSF6ItaUmcsthBASNoxcv0= BufferLoad: true BufferStore: true CUCount: null @@ -14071,10 +14173,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -14086,8 +14188,8 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -14104,47 +14206,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB0_NTC7_NTD3_NTM0_NEPBS14_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 LSCA: 32 - LSCB: 256 - LSPA: 16 - LSPB: 2 - LVCA: 16 - LVCB: 128 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 LVPA: 8 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -14152,15 +14254,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 2] - MIWaveTileA: 5 - MIWaveTileB: 2 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 256 - MacroTileA: 160 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -14174,28 +14276,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 0 - NonTemporalC: 7 + NonTemporalA: 4 + NonTemporalB: 7 + NonTemporalC: 3 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 160 - NumLoadsA: 10 - NumLoadsB: 16 - NumLoadsCoalescedA: 5 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14213,13 +14315,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 59 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA5_NTB0_NTC7_NTD3_NTM0_NEPBS14_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -14229,22 +14331,22 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 128 + SubGroup1: 64 SubGroupA: 2 - SubGroupB: 128 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 2 - ThreadTileA: 80 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -14254,45 +14356,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14300,17 +14403,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32rvSVHgGyc9O8bCX7KPSzyfhaYox1wvz-y6e2df6qooI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x128_MI16xJ4y05CuBpGc3taFnt7pzYk0hSNiMOEMi0fG9NiHuLBg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -14325,13 +14428,13 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -14343,72 +14446,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB1_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 0 - LSCA: 256 - LSCB: 128 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 256 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA6_NTB0_NTC6_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 + LdsBytesNoAmax: 124928 LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 124928 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -14420,21 +14523,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 6 - NonTemporalB: 1 + NonTemporalB: 0 NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 16 - NumLoadsB: 8 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14452,32 +14555,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 60 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA6_NTB1_NTC6_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA6_NTB0_NTC6_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM24_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 6 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -14493,36 +14596,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -14532,6 +14635,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14539,7 +14643,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1Nl77NbNO2lSWWQtfCc4QHlGwND2wKaigFRVChGJrByI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3ujNlCct7p7sebpSjxt4sZn01WDpJaiKS9omFcrPkkDA= BufferLoad: true BufferStore: true CUCount: null @@ -14565,15 +14669,15 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -14582,48 +14686,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 16 + LVCA: 64 LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 1 - LoopUnroll: 32 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -14631,23 +14735,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -14658,23 +14762,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 4 + NonTemporalA: 7 + NonTemporalB: 2 NonTemporalC: 6 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14691,7 +14795,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 61 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -14701,22 +14805,22 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 4 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -14732,19 +14836,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -14753,7 +14857,7 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -14771,6 +14875,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14778,7 +14883,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x1Ny3uix0FS1YjQCNJvjDQR_jNseIyG2aDoH3JXwKer5Q= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3nzu6Qyih_cejTkfbKQ9cp0TJkTsESgoR9kXX96zNFYU= BufferLoad: true BufferStore: true CUCount: null @@ -14804,15 +14909,15 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -14821,72 +14926,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB6_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 - LDSTrInst: 1 - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA6_NTB6_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28672 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 28672 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 1 - LoopUnroll: 32 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -14899,20 +15004,20 @@ NonTemporal: -1 NonTemporalA: 6 NonTemporalB: 6 - NonTemporalC: 2 + NonTemporalC: 5 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14930,32 +15035,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 62 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB6_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA6_NTB6_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 + StreamKXCCMapping: 4 + SubGroup0: 2 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -14971,19 +15076,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -14992,15 +15097,15 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -15010,6 +15115,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15017,7 +15123,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xfjyEPyKcLju337expcYgdW_PYDBxtV--GwqFuufDIHQ= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xO4XNUjHGrHqgODUpLv423LWJHZo8XNmE2QQyUHs4Hp4= BufferLoad: true BufferStore: true CUCount: null @@ -15027,10 +15133,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -15042,7 +15148,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -15050,7 +15156,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -15060,48 +15166,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -15113,44 +15219,44 @@ MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -15169,13 +15275,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 63 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB6_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -15183,17 +15289,17 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -15220,16 +15326,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -15238,17 +15344,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15256,7 +15363,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xZDgg0HrpoVaRxARF92LojQtAHhrsjP4H0KyqEhH-sCk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI32x3ZSMvGjIMLw0FMc1VgvEQMnZXoXvLulLF3Zm4sqeDSZY= BufferLoad: true BufferStore: true CUCount: null @@ -15266,7 +15373,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -15282,7 +15389,7 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -15290,7 +15397,7 @@ GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -15299,45 +15406,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB6_NTC0_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2 LDSTrInst: 0 LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 32 + LSPA: 4 + LSPB: 4 LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 16384 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] @@ -15375,23 +15482,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 4 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalA: 7 + NonTemporalB: 6 + NonTemporalC: 0 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15399,7 +15506,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -15408,12 +15515,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 64 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB6_NTC0_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -15422,7 +15529,7 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -15459,16 +15566,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [32, 2, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -15488,6 +15595,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15495,7 +15603,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xT_oWldoREPLcWaxzrswuGqxafTJ0Dx8ymCrTRmXFnI8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3Kw-SZQAtmHpopscjyAUHOSiylfaQV66yfpX_pR5KxNc= BufferLoad: true BufferStore: true CUCount: null @@ -15505,7 +15613,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -15526,7 +15634,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: false @@ -15538,16 +15646,16 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB6_NTC7_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 @@ -15575,8 +15683,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] @@ -15587,14 +15695,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -15614,16 +15722,16 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 5 - NonTemporalC: 1 + NonTemporalA: 4 + NonTemporalB: 6 + NonTemporalC: 7 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -15638,7 +15746,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -15647,17 +15755,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 65 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB5_NTC1_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB6_NTC7_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -15669,10 +15777,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15688,11 +15796,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -15700,14 +15808,14 @@ WavefrontSize: 64 WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -15716,8 +15824,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -15727,6 +15835,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15734,7 +15843,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3TwUtfedIksh229b4jL_RGCbgCNCFvRURgDStOE3W9l4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x32_MI32x3LhbYlim10ZqOk0nP8fXHOi_Hi7y_lAu7NG8aWiU96lc= BufferLoad: true BufferStore: true CUCount: null @@ -15744,7 +15853,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -15759,16 +15868,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -15777,39 +15886,39 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB7_NTC5_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB5_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 1 LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 28672 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 28672 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -15825,9 +15934,9 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 @@ -15853,22 +15962,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 7 + NonTemporalA: 5 + NonTemporalB: 5 NonTemporalC: 5 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 + NumElementsPerBatchStore: 16 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -15886,31 +15995,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 66 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB7_NTC5_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM2_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB5_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 32 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -15930,23 +16039,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 16 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -15955,17 +16064,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15973,7 +16083,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xsFKY_CSTrHV005ozWZtl7nzj65J_jfe3myJtWO2Gg5M= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x192x32_MI327SkXZTgVHkX5LyMuBLgxMYBdZ66v8HJttMYA7hEwXj0= BufferLoad: true BufferStore: true CUCount: null @@ -15983,7 +16093,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -15998,16 +16108,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -16016,72 +16126,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC0_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB6_NTC6_NTD3_NTM0_NEPBS4_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 256 + LSCB: 64 + LSPA: 1 + LSPB: 4 + LVCA: 256 + LVCB: 64 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 61440 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 61440 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 61440 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -16092,22 +16202,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalA: 1 + NonTemporalB: 6 + NonTemporalC: 6 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 4 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 32 + NumLoadsB: 24 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16125,38 +16235,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 67 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC0_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB6_NTC6_NTD3_NTM0_NEPBS4_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 4 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 4 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 3 + ThreadTileA: 64 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -16166,28 +16276,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -16205,6 +16315,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16212,7 +16323,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xEv6q-2RrREhV2X2V16Q3urIUlLubt1R_nl9HzjW-1Vo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xfBh1DNfo1fawGWm7X3qaY_5Squ83EfDfOEAKMCzM9Rk= BufferLoad: true BufferStore: true CUCount: null @@ -16237,7 +16348,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -16245,7 +16356,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -16255,24 +16366,24 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC6_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 64 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 114688 + LdsNumBytes: 131072 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -16304,14 +16415,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16331,22 +16442,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 5 - NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalA: 5 + NonTemporalB: 1 + NonTemporalC: 6 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16364,7 +16475,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 68 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC6_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM48_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -16373,12 +16484,12 @@ StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 5 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 @@ -16387,9 +16498,9 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16409,15 +16520,15 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 48 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -16433,8 +16544,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -16444,6 +16555,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16451,20 +16563,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3AVvuY4XB6qJXA1Io7C9SUPUCag0Hk7KDL3-9TPD8mCI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xcTh1OosyONIB0_BR79JtowSiOCUl1eOmBo7W6kJ1IEI= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -16476,16 +16588,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -16494,48 +16606,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC2_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 106496 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 106496 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 20480 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -16547,45 +16659,45 @@ MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 2 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16594,7 +16706,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16603,13 +16715,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 69 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB5_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC2_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -16617,17 +16729,17 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -16644,7 +16756,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -16654,35 +16766,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16690,7 +16803,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16x3NlLSXz4MTW4u3ogcgbK2QAMEnVRZJPbsZXS-xRtf7U= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32Gf-ZOVIhDoAfhdLX7QuucetgGovzU9cyt52o0H5akKE= BufferLoad: true BufferStore: true CUCount: null @@ -16700,7 +16813,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -16716,12 +16829,12 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -16733,72 +16846,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC3_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 + LSCA: 256 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -16809,22 +16922,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 0 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16842,38 +16955,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 70 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC3_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 1024 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 6 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -16883,28 +16996,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -16922,6 +17035,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16929,20 +17043,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3v3FiLk7zthz6BecsfG_ytiGrAKMvyDv3YQNVHChxGm8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT352x160x32_MI16ccTYseDAGn1oJXLhoZQThQNde5MmK0F8g1oeV6Ugr6Y= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -16955,7 +17069,7 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -16963,7 +17077,7 @@ GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -16972,98 +17086,98 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB6_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT352x160x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5632_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD1_NTM0_NEPBS2_NLCA11_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 8 + LSPB: 32 LVCA: 8 - LVCB: 32 + LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 5632 + LdsBlockSizePerPadB: 2560 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 45568 + LdsNumElementsAlignedB: 20992 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 45568 + LdsOffsetB_Blk: 112128 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 45568 + LdsOffsetMetadata_Blk: 112128 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 1 - LoopUnroll: 16 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [11, 5] + MIWaveTileA: 11 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 352 + MacroTile1: 160 + MacroTileA: 352 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 6 - NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 220 + NumGlobalWriteVectorsPerThread: 220 + NumLoadsA: 11 + NumLoadsB: 5 + NumLoadsCoalescedA: 11 + NumLoadsCoalescedB: 5 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17081,32 +17195,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 71 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB6_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT352x160x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5632_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD1_NTM0_NEPBS2_NLCA11_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSwapAddr: true + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 + StreamKXCCMapping: 4 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 44 + ThreadTile1: 5 + ThreadTileA: 44 + ThreadTileB: 5 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17132,22 +17246,22 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -17155,12 +17269,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17168,20 +17283,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3ztAWOuJIg4hj-4ujEuGL1U8M4eXirObR_poPSQp0OOM= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x96x32_MI16x1LpcRZaaV-Af8k2PL8BZusz8LZyRttzVO0p6xf9capGw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -17193,16 +17308,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -17211,98 +17326,98 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS2_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 1536 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 58368 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 58368 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 12800 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 45568 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 45568 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 1 - LoopUnroll: 16 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 3 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 4 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17311,7 +17426,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -17320,13 +17435,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 72 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB5_NTC3_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS2_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -17334,18 +17449,18 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 + StreamKXCCMapping: 8 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17361,7 +17476,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -17371,22 +17486,22 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -17394,12 +17509,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17407,7 +17523,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x3Foy5GH4mHSASXOOWFvkX6kZ_MyjRbIDODNqQU7kwR5k= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32sjq3VlWgDexdtB30zdaxZHZ2177W8NQHz9MJk1SRjvM= BufferLoad: true BufferStore: true CUCount: null @@ -17417,7 +17533,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -17438,7 +17554,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -17450,39 +17566,39 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -17498,15 +17614,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17526,21 +17642,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -17559,32 +17675,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 73 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 + StreamKXCCMapping: 4 + SubGroup0: 4 SubGroup1: 64 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17603,29 +17719,29 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -17639,6 +17755,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17646,7 +17763,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3kOSFPyfEiXFMwNZMGIhbzH01Yc6BxtbPYDwewDUScr8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1NQx2dS7HxM2pbst2fnrvQsmNkjm_6CZ0Yyr4B0RyrYE= BufferLoad: true BufferStore: true CUCount: null @@ -17677,7 +17794,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -17689,7 +17806,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB3_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 64 LSCB: 32 @@ -17699,8 +17816,8 @@ LVCB: 8 LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 57344 LdsInitCVgprs: false @@ -17721,26 +17838,26 @@ LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 MacroTile1: 32 @@ -17751,10 +17868,10 @@ MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -17765,16 +17882,16 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 3 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 + NumElementsPerBatchStore: 16 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -17789,7 +17906,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -17798,32 +17915,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 74 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB2_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB3_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17839,19 +17956,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -17860,7 +17977,7 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -17878,6 +17995,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17885,20 +18003,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xw4RgIcqBRFqEmLFOO3XP4aJt222Ys1eV6bag4UczOws= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3GzPpMCCg6eosyo0J9SwQ5vW0imarFZnMlHiOChdob7M= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -17910,15 +18028,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -17928,48 +18046,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -17977,48 +18095,48 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 5 NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalC: 3 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -18028,7 +18146,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18037,32 +18155,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 75 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18078,28 +18196,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -18111,12 +18229,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18124,7 +18243,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x3mCH6mtXuBM9CjeHzE0J8gmubeKpBlXAZcQUfI9LyIOk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3NSZxol_RK3ulhtjKNVM4wmncpvgvfydSWBZJx3QPOQ4= BufferLoad: true BufferStore: true CUCount: null @@ -18149,7 +18268,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -18157,7 +18276,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -18167,45 +18286,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 1 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 LSCB: 64 - LSPA: 32 + LSPA: 4 LSPB: 16 - LVCA: 8 + LVCA: 64 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] @@ -18215,14 +18334,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] + MIWaveGroup: [2, 2] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -18243,21 +18362,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -18276,12 +18395,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 76 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -18291,9 +18410,9 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 + SubGroup0: 4 SubGroup1: 64 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false @@ -18317,7 +18436,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -18327,7 +18446,7 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 @@ -18345,8 +18464,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -18356,6 +18475,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18363,7 +18483,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3syvkIi8LYYEv3DFYiM1Iz5gfUUiF9htX1js2hwjk18E= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32st4uOxQc9vEhWZhR3gpfTesG3QAsHuiqHPy0f-iqroE= BufferLoad: true BufferStore: true CUCount: null @@ -18373,10 +18493,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -18388,16 +18508,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -18406,47 +18526,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB7_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA7_NTB3_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 128 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18454,15 +18574,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18476,27 +18596,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 7 - NonTemporalC: 4 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 7 NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -18506,7 +18626,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18515,38 +18635,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 77 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB7_NTC4_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA7_NTB3_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -18556,28 +18676,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -18589,12 +18709,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18602,7 +18723,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3fJnt75929EZ_aqQCP_MCPyiNiz8LyKpNQWmHPNmxA6E= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI328rtlOkIRB2U8sv7gASwl7nDep88FFUev28ZzQZ-C8MY= BufferLoad: true BufferStore: true CUCount: null @@ -18612,10 +18733,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -18627,16 +18748,16 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -18645,47 +18766,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB0_NTC4_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 256 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18693,15 +18814,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18715,28 +18836,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 16 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18745,7 +18866,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18754,38 +18875,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 78 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB0_NTC4_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -18795,45 +18916,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18841,20 +18963,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3kO-rrc7SP8abCiazbzjpjv970HJ6dMdiUNvfFInMaUg= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1G3yMdb7BIPmH0-pBdSqMzeJXAKcONTAp7QuSBNzCeH4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -18872,7 +18994,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -18884,77 +19006,77 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB7_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 32 - LVCA: 16 + LSPB: 16 + LVCA: 8 LVCB: 8 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 9728 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 9728 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 9728 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 1 - LoopUnroll: 16 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -18962,21 +19084,21 @@ NonTemporal: -1 NonTemporalA: 5 NonTemporalB: 7 - NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalC: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 + NumElementsPerBatchStore: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -18993,38 +19115,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 79 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA5_NTB7_NTC6_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM16_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -19037,29 +19159,29 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -19067,12 +19189,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19080,17 +19203,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16x7Ue9K6K2Ntf9cWH31tBpMfG-dUbDAAZYidjEukyBkJ8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1d95FnwSepCpQ1VNN-c9bIuWmSq8NCHH4Yv9ZeAeUB5U= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -19111,7 +19234,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -19123,45 +19246,45 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 8192 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 8192 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -19172,14 +19295,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19199,23 +19322,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalA: 5 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -19232,21 +19355,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 80 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD4_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 16 SubGroupA: 4 @@ -19254,10 +19377,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19273,26 +19396,26 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -19301,8 +19424,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -19312,6 +19435,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19319,32 +19443,32 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3-Y1s2goE5bt3ELzlBLdYEEGXo4CN58GBD1CcuhssmnI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1vdET2FEVxtnfxZJASjE4Mpu0qbCqFDyzkii6x1dbHPs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -19352,7 +19476,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -19362,106 +19486,106 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 28672 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 28672 + LdsNumElementsAlignedA: 4096 LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 1 - LoopUnroll: 16 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 5 NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -19471,7 +19595,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 81 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA7_NTB5_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -19480,23 +19604,23 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19516,22 +19640,22 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: false _VectorStore: 1 @@ -19540,17 +19664,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19558,20 +19683,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI32x3jGU7eBWYn2uq06g9WNdz2rJrlsiV08zOFEkl_DheAO8= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1XVsYbgnD3R965KnpJIJjvIue0CI2SvPxRYsIPhAomzo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -19589,7 +19714,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -19601,99 +19726,99 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB6_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 LSCA: 32 - LSCB: 32 - LSPA: 4 + LSCB: 64 + LSPA: 8 LSPB: 4 LVCA: 32 - LVCB: 32 - LVPA: 4 + LVCB: 64 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 12800 LdsInitCVgprs: false - LdsNumBytes: 16384 + LdsNumBytes: 12800 LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 16384 LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 12800 + LdsOffsetMetadata_Blk: 20480 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 1 - LoopUnroll: 16 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalA: 1 + NonTemporalB: 6 + NonTemporalC: 7 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 12 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -19701,7 +19826,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -19710,31 +19835,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 82 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB6_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -19751,17 +19876,17 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -19772,7 +19897,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -19784,12 +19909,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19797,40 +19923,40 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32UDSWT1bfvHLxaf8dJ8T0eRbOPTSnBnL-fVZx5Ms-LhM= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xk1pA2wRYcqyCR46mTdGjDlhdTU15it1Jh10auM8T_po= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -19840,34 +19966,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA5_NTB0_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 64 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 4096 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 73728 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 73728 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -19875,63 +20001,63 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 + NonTemporalA: 5 + NonTemporalB: 0 NonTemporalC: 7 - NonTemporalD: 3 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 + NumElementsPerBatchStore: 12 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19939,8 +20065,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -19949,21 +20075,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 83 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD3_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA5_NTB0_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -19971,10 +20097,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19990,19 +20116,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -20011,7 +20137,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -20023,12 +20149,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20036,7 +20163,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x192x32_MI32fns-FsA_JlqfawCOHWwiIpnsG-TX8DiAPNFhSAbfy1c= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xWktQ7pkAc4eXErbVdZeJISaskwk40DfNVmN4lX1TIVk= BufferLoad: true BufferStore: true CUCount: null @@ -20047,9 +20174,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -20079,34 +20206,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB4_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 64 + LSCA: 64 + LSCB: 128 LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSPB: 4 + LVCA: 16 + LVCB: 32 LVPA: 2 - LVPB: 4 + LVPB: 1 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 24576 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -20114,12 +20241,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -20127,15 +20254,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 192 - MacroTileA: 128 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20149,29 +20276,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 4 NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalC: 7 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 - NumLoadsB: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 8 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20188,32 +20315,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 84 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB4_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 6 - SubGroup0: 4 + SubGroup0: 2 SubGroup1: 64 - SubGroupA: 4 + SubGroupA: 2 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 3 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 3 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20229,19 +20356,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -20250,7 +20377,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -20262,12 +20389,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20275,7 +20403,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3X-wpSRHzrp-MeuOcz4MBfcmq-O8PWo_CDo9EAFFm4zI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16x_DAw4csScAbaMhQdK6dmv9jQOzE5sE-cLR2YRUIrRvU= BufferLoad: true BufferStore: true CUCount: null @@ -20285,7 +20413,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -20318,72 +20446,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + LDSTrInst: 1 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -20394,23 +20522,25 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20427,13 +20557,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 85 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB7_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM0_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -20441,17 +20571,17 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -20466,9 +20596,12 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -20478,35 +20611,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20514,7 +20648,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x32x32_MI32xYjM8KzEWXOVAnYjY1guEsDCr3YdYN-J2M3Jt09-isIo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x48x64_MI16x1MqDWekebqrvXnO-Sb3hzu3z0gjkhgQRF_inqkjHmDjg= BufferLoad: true BufferStore: true CUCount: null @@ -20524,10 +20658,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -20540,15 +20674,15 @@ ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 @@ -20557,83 +20691,83 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPB: 16 + LVCA: 8 + LVCB: 4 LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 53248 + LdsBytesNoAmax: 22528 LdsInitCVgprs: false - LdsNumBytes: 53248 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 22528 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 13824 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 22528 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 @@ -20641,15 +20775,17 @@ NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 64 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20666,38 +20802,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 86 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM24_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -20705,28 +20841,31 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 24 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -20740,12 +20879,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20753,7 +20893,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xNwlw2eRLckxkCN-CUwNSdJhWdS3ncTyQtpMiviPnWD0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xgmSt2MoRmL0g9WeUHU2DTu1EingvlwMUOcXgdGK6Qso= BufferLoad: true BufferStore: true CUCount: null @@ -20763,10 +20903,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -20778,7 +20918,7 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -20786,7 +20926,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -20796,48 +20936,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -20849,46 +20989,48 @@ MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 4 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20905,13 +21047,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 87 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB2_NTC3_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM48_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -20919,17 +21061,17 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -20944,9 +21086,12 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -20956,35 +21101,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20992,7 +21138,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x128_MI16xMeVY6M4rG08B1L31ecGGxHsYBMoyD7ClC_9Q_YvLxrE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xcj81g2ldg0_1p4EGvdXPUPXmYszqzbqqiK6W7fsyN7o= BufferLoad: true BufferStore: true CUCount: null @@ -21003,9 +21149,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -21035,34 +21181,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 32 - LSCB: 16 - LSPA: 32 - LSPB: 64 - LVCA: 8 - LVCB: 4 - LVPA: 8 - LVPB: 16 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -21070,13 +21216,13 @@ LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -21088,46 +21234,48 @@ MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 3 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -21135,7 +21283,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -21144,12 +21292,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 88 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -21158,17 +21306,17 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 32 ThreadTile1: 1 - ThreadTileA: 8 + ThreadTileA: 32 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -21183,9 +21331,12 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -21195,9 +21346,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -21206,7 +21357,7 @@ _DepthUB: 128 _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -21218,12 +21369,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21231,7 +21383,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x3QNX14jqPW75Zz3vlZTY-T_2lUZsf-R8ZcuULerVLK3c= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT68GNy3DOANu_i2VXAB1Y2BeloAMEwWh68Ero33soSvMM= BufferLoad: true BufferStore: true CUCount: null @@ -21241,72 +21393,72 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC7_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 0 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 LSCB: 64 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -21314,7 +21466,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -21322,14 +21474,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -21342,7 +21494,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -21350,22 +21502,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 5 - NonTemporalC: 7 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21383,32 +21536,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 89 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC7_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -21424,36 +21578,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -21461,8 +21615,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21470,7 +21625,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x32AltWgSZdqdwx7GzSAOhOKSF6ItaUmcsthBASNoxcv0= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6wJcIFiIXcL9xRCcdjHOdxHUlrFCn9hAkB7Ii07m5Oeo= BufferLoad: true BufferStore: true CUCount: null @@ -21480,28 +21635,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -21510,42 +21665,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 1 - LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -21553,7 +21708,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -21561,15 +21716,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21581,7 +21736,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -21589,22 +21744,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 7 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 + NumElementsPerBatchStore: 8 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21622,31 +21778,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 90 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA4_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -21663,36 +21820,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -21702,6 +21859,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21709,38 +21867,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x128_MI16xJ4y05CuBpGc3taFnt7pzYk0hSNiMOEMi0fG9NiHuLBg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -21749,51 +21906,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA6_NTB0_NTC6_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 LSCA: 64 - LSCB: 16 + LSCB: 64 LSPA: 16 - LSPB: 64 + LSPB: 16 LVCA: 16 - LVCB: 4 + LVCB: 16 LVPA: 4 - LVPB: 16 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 512 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124928 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 124928 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 26112 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 98816 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -21801,49 +21958,50 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 48 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 48 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21861,38 +22019,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 91 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA6_NTB0_NTC6_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM24_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -21902,45 +22061,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 24 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21948,7 +22108,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x128x64_MI32xakdnwiFEzjPGZlWXrSw6ljoapBVWX_1-wqDBKJ3S68c= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6MFyMQpdiVOcqKVaa7dbJIhJPdEmQ0U9Tqd7zT-vG3WY= BufferLoad: true BufferStore: true CUCount: null @@ -21960,26 +22120,26 @@ DebugStreamK: 0 DepthU: 64 DirectToLds: true - DirectToLdsA: false + DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -21988,50 +22148,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA5_NTB7_NTC5_NTD0_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 - LSCA: 32 + LSCA: 64 LSCB: 128 - LSPA: 32 + LSPA: 16 LSPB: 8 - LVCA: 8 + LVCA: 16 LVCB: 32 - LVPA: 8 + LVPA: 4 LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 122880 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 122880 - LdsNumElementsAlignedA: 24576 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 90112 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22039,14 +22199,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 96 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 96 + MacroTileA: 64 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -22059,30 +22219,31 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 7 - NonTemporalC: 5 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 NumLoadsB: 8 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22100,32 +22261,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 92 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA5_NTB7_NTC5_NTD0_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 128 + SubGroup1: 64 SubGroupA: 2 - SubGroupB: 128 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 1 - ThreadTileA: 48 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22144,42 +22306,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true + tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22187,7 +22350,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xhegbm5IjE5kpDKAXXbBgVDqQA9_3kFW7cQz-30MgRCE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT13u0IoyZA0H6JeQsxYx5dKpo7nQ9hoyYgsEHeT0aCVTI= BufferLoad: true BufferStore: true CUCount: null @@ -22197,27 +22360,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -22227,50 +22390,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA4_NTB1_NTC7_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22278,15 +22441,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 2] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22298,30 +22461,31 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22339,27 +22503,28 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 93 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA4_NTB1_NTC7_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 ThreadTile1: 2 @@ -22380,7 +22545,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -22390,35 +22555,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22426,7 +22592,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x32_MI32xA5Q2WtheKtu6QMCmv4Znv6HCAbvakRTCv3w6SKODVK4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3nYUFHmrSHNp-n-T71OSxpo2-QMWXfro-CzcYssiDH2w= BufferLoad: true BufferStore: true CUCount: null @@ -22437,27 +22603,27 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -22466,50 +22632,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA4_NTB7_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 0 LSCA: 128 - LSCB: 64 + LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 32 LVCA: 32 - LVCB: 16 + LVCB: 8 LVPA: 2 - LVPB: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 126976 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 12288 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 49152 + LdsOffsetB_Blk: 114688 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 114688 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22517,15 +22683,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22537,30 +22703,31 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22569,7 +22736,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -22578,36 +22745,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 94 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA4_NTB7_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 + StreamKXCCMapping: 0 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 48 + ThreadTile1: 3 + ThreadTileA: 48 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -22622,42 +22790,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 2, 2] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22665,7 +22834,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3ujNlCct7p7sebpSjxt4sZn01WDpJaiKS9omFcrPkkDA= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VJH_RBajY8L_kN5xvMaq7RQvJq3SsD2MoUk710Se8-I= BufferLoad: true BufferStore: true CUCount: null @@ -22675,80 +22844,80 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 16 - LVCA: 64 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 73728 LdsInitCVgprs: false - LdsNumBytes: 65536 + LdsNumBytes: 73728 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedB: 57344 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 131072 LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB_Blk: 147456 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 147456 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22756,15 +22925,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 7] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 448 + MacroTileA: 128 + MacroTileB: 448 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22776,30 +22945,31 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 4 + NumLoadsB: 14 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsCoalescedB: 7 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22808,7 +22978,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -22817,32 +22987,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 95 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 7 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 7 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22862,41 +23033,42 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22904,7 +23076,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3nzu6Qyih_cejTkfbKQ9cp0TJkTsESgoR9kXX96zNFYU= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6rqfD3ghhAs2e9z1zuI_aBgcgETHE2YH-hIjPMwl5H1A= BufferLoad: true BufferStore: true CUCount: null @@ -22914,80 +23086,80 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA6_NTB6_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22995,15 +23167,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23015,7 +23187,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -23023,22 +23195,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 6 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23047,7 +23220,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23056,32 +23229,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 96 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA6_NTB6_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 32 + SubGroup1: 128 SubGroupA: 2 - SubGroupB: 32 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23101,30 +23275,30 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -23134,8 +23308,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23143,7 +23318,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1DkE1Pm7B7tu5CU4zsZYTE5zyVc3w0JNiRD9vw48lz-M= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6M8k0zgCr79GezsF4koVKCwmzswSUcG00QTbkQGZOzc4= BufferLoad: true BufferStore: true CUCount: null @@ -23153,81 +23328,81 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: 1 - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -23235,26 +23410,26 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -23262,23 +23437,24 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 4 - NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -23286,7 +23462,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23295,32 +23471,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 97 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23339,33 +23516,33 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -23373,8 +23550,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23382,7 +23560,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3iPkC1ju8KB9TUVlQezZnhwLo83WpXzjn_z8hDH25qMs= BufferLoad: true BufferStore: true CUCount: null @@ -23396,76 +23573,76 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB5_NTC0_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23473,15 +23650,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23493,7 +23670,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -23501,22 +23678,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 5 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23525,7 +23703,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23534,32 +23712,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 98 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB5_NTC0_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 32 + SubGroup1: 64 SubGroupA: 2 - SubGroupB: 32 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23575,45 +23754,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23621,7 +23801,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3t4qDciKFGFI7k9IDTRUUOln0r0cA4pct9CJVaPHEfKI= BufferLoad: true BufferStore: true CUCount: null @@ -23631,67 +23810,67 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 49152 LdsInitCVgprs: false - LdsNumBytes: 32768 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -23699,12 +23878,12 @@ LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23713,13 +23892,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -23732,30 +23911,31 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 7 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23764,7 +23944,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23773,36 +23953,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 99 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB7_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -23814,10 +23995,10 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -23825,34 +24006,35 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23860,7 +24042,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xO4XNUjHGrHqgODUpLv423LWJHZo8XNmE2QQyUHs4Hp4= BufferLoad: true BufferStore: true CUCount: null @@ -23870,28 +24051,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -23900,51 +24081,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 0 - LSCA: 16 - LSCB: 16 - LSPA: 64 - LSPB: 64 - LVCA: 4 - LVCB: 4 - LVPA: 16 - LVPB: 16 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 20480 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -23952,49 +24133,50 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24012,32 +24194,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 100 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 1024 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -24056,42 +24239,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24099,7 +24283,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI32x3ZSMvGjIMLw0FMc1VgvEQMnZXoXvLulLF3Zm4sqeDSZY= BufferLoad: true BufferStore: true CUCount: null @@ -24109,80 +24292,80 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB6_NTC0_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2 - LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -24191,14 +24374,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24210,23 +24393,23 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 6 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 NumLoadsB: 8 @@ -24234,7 +24417,8 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24242,7 +24426,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -24251,18 +24435,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 101 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB6_NTC0_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -24272,15 +24456,16 @@ SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -24292,45 +24477,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24338,7 +24524,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3Kw-SZQAtmHpopscjyAUHOSiylfaQV66yfpX_pR5KxNc= BufferLoad: true BufferStore: true CUCount: null @@ -24348,67 +24533,67 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB6_NTC7_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 98304 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 98304 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 49152 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 163840 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 81920 + LdsOffsetMetadata_Blk: 163840 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -24416,12 +24601,12 @@ LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -24430,14 +24615,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24449,30 +24634,31 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 6 - NonTemporalC: 7 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24481,7 +24667,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -24490,36 +24676,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 102 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB6_NTC7_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 2 SubGroup1: 32 SubGroupA: 2 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 32 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -24535,41 +24722,42 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24577,7 +24765,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI32x3W8Z5ZSNH64DOPbOnJWe8w1zi1tozbubXkwtbGedSVPo= BufferLoad: true BufferStore: true CUCount: null @@ -24587,27 +24774,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -24617,101 +24804,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB4_NTC6_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 768 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 26624 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 59392 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] + MIWaveGroup: [2, 1] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 48 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 4 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24720,7 +24908,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -24729,36 +24917,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 103 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB4_NTC6_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -24770,7 +24959,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -24780,35 +24969,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24816,7 +25006,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x32_MI32x3LhbYlim10ZqOk0nP8fXHOi_Hi7y_lAu7NG8aWiU96lc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Yk7kOAJB_aE6VWqrVWf__KLZBXdHS4G4A_wz_1sHzCU= BufferLoad: true BufferStore: true CUCount: null @@ -24826,132 +25016,133 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB5_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28672 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 28672 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 16 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] + MIWaveGroup: [1, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 5 - NonTemporalC: 5 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumMbskPrefetchElements: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24959,7 +25150,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -24968,36 +25159,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 104 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB5_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -25009,7 +25201,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -25019,35 +25211,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 2, 2] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25055,7 +25248,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI32x33yLL0SJKE2m7wSkwpjfsRBbNJmY03s6tqg4A_rIjS2Q= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3to8cjWEs6y4SguQB5Eo9GwEWnTleft3p6-QhBINReWU= BufferLoad: true BufferStore: true CUCount: null @@ -25065,27 +25258,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -25095,37 +25288,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB4_NTC4_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 LSCA: 32 - LSCB: 64 + LSCB: 32 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -25136,37 +25329,37 @@ LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -25174,22 +25367,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 + NonTemporalA: 0 NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25207,32 +25401,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 105 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA7_NTB4_NTC4_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25248,11 +25443,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -25260,22 +25455,22 @@ WavefrontSize: 64 WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -25283,10 +25478,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25294,7 +25490,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x192x32_MI327SkXZTgVHkX5LyMuBLgxMYBdZ66v8HJttMYA7hEwXj0= BufferLoad: true BufferStore: true CUCount: null @@ -25304,131 +25499,131 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB6_NTC6_NTD3_NTM0_NEPBS4_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 256 - LSCB: 64 - LSPA: 1 - LSPB: 4 - LVCA: 256 - LVCB: 64 - LVPA: 1 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 192 - MacroTileA: 256 - MacroTileB: 192 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 32 - NumLoadsB: 24 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25446,38 +25641,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 106 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA1_NTB6_NTC6_NTD3_NTM0_NEPBS4_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 3 - ThreadTileA: 64 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -25487,45 +25683,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25533,7 +25730,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32LJJdRcK9uX5siV1xSWRUHMOlzu2hp3_CNfkuDSNvJTo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3eZ5RqKxB1O3236AeikNxnkmOhdKPC4D_VFkvtbj1K28= BufferLoad: true BufferStore: true CUCount: null @@ -25543,28 +25740,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -25573,21 +25770,21 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 LdsBytesNoAmax: 65536 LdsInitCVgprs: false @@ -25608,43 +25805,43 @@ LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -25653,21 +25850,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 7 + NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25685,31 +25883,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 107 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC0_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 4 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -25726,34 +25925,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -25763,8 +25962,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25772,7 +25972,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI32xfBh1DNfo1fawGWm7X3qaY_5Squ83EfDfOEAKMCzM9Rk= BufferLoad: true BufferStore: true CUCount: null @@ -25786,104 +25985,104 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC6_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 131072 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -25891,22 +26090,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 32 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25924,31 +26124,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 108 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC6_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM5_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM48_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 4 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -25965,36 +26166,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -26002,8 +26203,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26011,7 +26213,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32eYnwK6Kei8GLtUjTz4SwAJN4sTgnevpcBqDgn7APuNc= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WumQmrRZO9w1nG8nVI-z2VGL_a-D9ZwkiDz46zvf_bQ= BufferLoad: true BufferStore: true CUCount: null @@ -26021,28 +26223,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -26051,37 +26253,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -26091,38 +26293,38 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -26130,23 +26332,24 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26163,32 +26366,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 109 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26207,31 +26411,31 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -26239,10 +26443,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26250,12 +26455,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xcTh1OosyONIB0_BR79JtowSiOCUl1eOmBo7W6kJ1IEI= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT348doMK9QEbWRou0upuvJNPJpi_evI_jhEzrHZU7JHfw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -26264,24 +26469,24 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -26290,38 +26495,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC2_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 1 - LSCA: 16 + LSCA: 32 LSCB: 16 - LSPA: 64 + LSPA: 32 LSPB: 64 - LVCA: 4 + LVCA: 8 LVCB: 4 - LVPA: 16 + LVPA: 8 LVPB: 16 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 106496 + LdsBytesNoAmax: 53248 LdsInitCVgprs: false - LdsNumBytes: 106496 - LdsNumElementsAlignedA: 20480 + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 20480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 - LdsPadA: 16 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -26333,7 +26538,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26342,13 +26547,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -26369,21 +26574,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 2 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 1 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -26402,18 +26607,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 110 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC2_NTD0_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS1024_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 1024 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -26423,10 +26628,11 @@ SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -26443,10 +26649,10 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -26455,22 +26661,22 @@ WavefrontSize: 64 WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 256 _DepthUA: 256 _DepthUB: 256 _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -26480,8 +26686,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26489,7 +26696,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x64_MI16x1SuDoxsto8y0N6ju4XPIRoswylaI8Zz1_dr095Qvnb80= BufferLoad: true BufferStore: true CUCount: null @@ -26499,28 +26705,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: true DirectToLdsA: true - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -26529,50 +26735,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA4_NTB0_NTC7_NTD2_NTM0_NEPBS2_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 768 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62464 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 62464 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 13312 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 16 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26580,15 +26786,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 48 - MacroTileA: 64 - MacroTileB: 48 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26602,28 +26808,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -26641,32 +26847,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 111 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA4_NTB0_NTC7_NTD2_NTM0_NEPBS2_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26682,45 +26889,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false - tailLoopOptB: true + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26728,7 +26936,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32Gf-ZOVIhDoAfhdLX7QuucetgGovzU9cyt52o0H5akKE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zjgFvuyb2NeEcS1wEFnStawG246sqNqAes5-pKzM548= BufferLoad: true BufferStore: true CUCount: null @@ -26738,28 +26946,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -26768,102 +26976,103 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 0 - LSCA: 256 - LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 2 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumMbskPrefetchElements: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26880,38 +27089,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 112 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB3_NTC2_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -26924,42 +27134,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26967,38 +27178,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT352x160x32_MI16ccTYseDAGn1oJXLhoZQThQNde5MmK0F8g1oeV6Ugr6Y= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -27007,50 +27217,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT352x160x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5632_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD1_NTM0_NEPBS2_NLCA11_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 8 - LVCB: 8 + LVCB: 4 LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 5632 - LdsBlockSizePerPadB: 2560 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 133120 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 133120 - LdsNumElementsAlignedA: 45568 - LdsNumElementsAlignedB: 20992 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66560 - LdsOffsetB: 45568 - LdsOffsetB_Blk: 112128 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45568 - LdsOffsetMetadata_Blk: 112128 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27058,15 +27268,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [11, 5] - MIWaveTileA: 11 - MIWaveTileB: 5 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 352 - MacroTile1: 160 - MacroTileA: 352 - MacroTileB: 160 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27080,28 +27290,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 3 - NonTemporalD: 1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 220 - NumGlobalWriteVectorsPerThread: 220 - NumLoadsA: 11 - NumLoadsB: 5 - NumLoadsCoalescedA: 11 - NumLoadsCoalescedB: 5 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27110,7 +27320,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -27119,32 +27329,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 113 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT352x160x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5632_LBSPPB2560_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT11_5_MO40_NTn1_NTA0_NTB1_NTC3_NTD1_NTM0_NEPBS2_NLCA11_NLCB5_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 44 - ThreadTile1: 5 - ThreadTileA: 44 - ThreadTileB: 5 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27160,45 +27371,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27206,37 +27418,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x96x32_MI16x1LpcRZaaV-Af8k2PL8BZusz8LZyRttzVO0p6xf9capGw= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3-LSPS0cmSEhfqGL2SuyEC3Fz2PkCkvfzxBANwy6XxN4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -27246,11 +27458,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS2_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 LSCA: 32 LSCB: 32 LSPA: 32 @@ -27259,37 +27471,37 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 1536 - LdsBlockSizePerPadB: 1536 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58368 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 58368 - LdsNumElementsAlignedA: 12800 - LdsNumElementsAlignedB: 12800 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 12800 - LdsOffsetB_Blk: 45568 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 12800 - LdsOffsetMetadata_Blk: 45568 - LdsPadA: 16 - LdsPadB: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27297,15 +27509,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 96 - MacroTileA: 96 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27317,30 +27529,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 3 - NumLoadsB: 3 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27358,32 +27571,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 114 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1536_LBSPPB1536_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS2_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27399,45 +27613,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27445,7 +27660,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32sjq3VlWgDexdtB30zdaxZHZ2177W8NQHz9MJk1SRjvM= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3JLacchZPwjeBP68mM8x4ppN98WdtQKDcx7Xp-zqDpyY= BufferLoad: true BufferStore: true CUCount: null @@ -27455,27 +27670,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -27485,78 +27700,78 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 LDSTrInst: 0 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 65536 + LdsNumBytes: 114688 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 65536 LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 2] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -27564,22 +27779,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27597,31 +27813,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 115 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB2_NTC4_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -27648,24 +27865,24 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -27675,8 +27892,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27684,7 +27902,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32CT5g2e72DPa6AoSaCz5CFHb-qxacyGO3666s_0uERTo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1F29g57gDKx143xeF4Ry8TpDfmB9904AahmpgslE7iFA= BufferLoad: true BufferStore: true CUCount: null @@ -27694,28 +27912,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -27724,37 +27942,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 32768 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -27764,38 +27982,38 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -27803,23 +28021,24 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -27836,32 +28055,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 116 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC2_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27877,34 +28097,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -27912,10 +28132,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27923,7 +28144,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1NQx2dS7HxM2pbst2fnrvQsmNkjm_6CZ0Yyr4B0RyrYE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Va3I0gXvAjl6WSNm7VfyQxdSiay22E7f07A2gXBFahE= BufferLoad: true BufferStore: true CUCount: null @@ -27934,26 +28155,26 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -27963,50 +28184,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB3_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 37888 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 37888 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 70656 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 + LdsOffsetMetadata: 37888 + LdsOffsetMetadata_Blk: 70656 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -28014,15 +28235,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] + MIWaveGroup: [1, 4] MIWaveTile: [1, 2] MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 128 + MacroTileA: 16 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -28034,30 +28255,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 + NumElementsPerBatchStore: 8 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28075,27 +28297,28 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 117 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB3_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 ThreadTile1: 2 @@ -28116,7 +28339,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -28126,35 +28349,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28162,7 +28386,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3T2TXD7dyYbmbyNhlrtkshdt0UCqzHK0XZwxVfT5KmII= BufferLoad: true BufferStore: true CUCount: null @@ -28173,26 +28396,26 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -28202,101 +28425,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 46080 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 46080 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 78848 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 78848 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 2 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28314,32 +28538,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 118 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB2_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -28355,45 +28580,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28401,7 +28627,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3nB-JC3_AX_KH5Q9eyL9-Q4p4zkAhZW9ncBxvxcpPqgo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Pg_BkawruRCL1XGe64CaI5c_MySTZSQHvn2xslbYUME= BufferLoad: true BufferStore: true CUCount: null @@ -28412,27 +28638,27 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -28441,37 +28667,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 49152 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -28479,13 +28705,13 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -28493,49 +28719,50 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28553,32 +28780,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 119 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 16 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -28597,42 +28825,43 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28640,7 +28869,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3GzPpMCCg6eosyo0J9SwQ5vW0imarFZnMlHiOChdob7M= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kiD3-1VtWxVcsUpwA9y_XMbufpwlDdmtaKQ75wO2y28= BufferLoad: true BufferStore: true CUCount: null @@ -28654,76 +28883,76 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 LDSTrInst: 1 - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 16 - LVCA: 64 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -28731,14 +28960,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 1] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -28759,21 +28988,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -28783,7 +29012,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -28792,24 +29021,24 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 120 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB0_NTC3_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false @@ -28843,24 +29072,24 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [64, 2, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -28872,6 +29101,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28879,7 +29109,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3NSZxol_RK3ulhtjKNVM4wmncpvgvfydSWBZJx3QPOQ4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1yMCAGzp9nwTbEima2Un3PvZ0HXHBQIMvg1dFgzYWa_Y= BufferLoad: true BufferStore: true CUCount: null @@ -28889,49 +29119,49 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 @@ -28959,10 +29189,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -28971,14 +29201,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -28998,21 +29228,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -29031,18 +29261,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 121 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB1_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -29053,10 +29283,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -29072,11 +29302,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -29084,22 +29314,22 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -29107,10 +29337,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29118,12 +29349,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x384x32_MI32RlWxLGDTWZh3JudVunUYayPOa5L9KhMoWN1WpS2AQ3o= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qduvuzsXPPzFHa_P72l7BR-VshFrlFZLdSkdePVgfp4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -29132,24 +29363,24 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -29158,37 +29389,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x384x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA6_NTB0_NTC3_NTD1_NTM0_NEPBS8_NLCA5_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 128 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 49152 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 90112 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 90112 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -29201,7 +29432,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -29209,15 +29440,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 3] - MIWaveTileA: 5 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 384 - MacroTileA: 160 - MacroTileB: 384 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29237,22 +29468,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 3 - NonTemporalD: 1 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 240 - NumLoadsA: 5 - NumLoadsB: 12 - NumLoadsCoalescedA: 5 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -29270,31 +29501,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 122 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x384x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA6_NTB0_NTC3_NTD1_NTM0_NEPBS8_NLCA5_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 80 + ThreadTile0: 32 ThreadTile1: 3 - ThreadTileA: 80 + ThreadTileA: 32 ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true @@ -29311,36 +29542,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -29350,6 +29581,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29357,7 +29589,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3OoiSIXWFvbd682u5GKn1jw4rqir0xCc9vunjY-HgA8Q= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1LmAvRmupSNgrSah59TXAt88n3c4wrKF6OZh9XiWLgCg= BufferLoad: true BufferStore: true CUCount: null @@ -29371,24 +29603,24 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -29397,37 +29629,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB0_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -29440,7 +29672,7 @@ LoopIters: 4 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -29449,14 +29681,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29476,22 +29708,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -29509,18 +29741,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 123 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA6_NTB0_NTC1_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -29531,10 +29763,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -29550,11 +29782,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -29562,33 +29794,34 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29596,7 +29829,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32st4uOxQc9vEhWZhR3gpfTesG3QAsHuiqHPy0f-iqroE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1npxiDxAnSPzJZrvFG5oOPfnm5vME6tIZYSSv4aPUpfk= BufferLoad: true BufferStore: true CUCount: null @@ -29610,23 +29843,23 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -29636,39 +29869,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA7_NTB3_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 128 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -29679,7 +29912,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -29688,14 +29921,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 4] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 256 + MacroTile1: 192 MacroTileA: 128 - MacroTileB: 256 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29715,22 +29948,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -29748,21 +29981,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 124 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA7_NTB3_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -29771,15 +30004,15 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 4 + ThreadTile1: 3 ThreadTileA: 32 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTileB: 3 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -29793,7 +30026,7 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -29801,22 +30034,22 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -29824,10 +30057,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29835,7 +30069,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI328rtlOkIRB2U8sv7gASwl7nDep88FFUev28ZzQZ-C8MY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vIn45BtBGJJHgOeHh-5A8gVBnOI_bnqOc5L-OKq7gYA= BufferLoad: true BufferStore: true CUCount: null @@ -29849,24 +30083,24 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -29875,39 +30109,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB0_NTC4_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 256 - LSCB: 256 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -29918,7 +30152,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -29927,14 +30161,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29954,22 +30188,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 16 - NumLoadsB: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -29987,21 +30221,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 125 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB0_NTC4_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -30009,16 +30243,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -30031,8 +30265,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -30040,22 +30274,22 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -30063,10 +30297,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30074,38 +30309,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1G3yMdb7BIPmH0-pBdSqMzeJXAKcONTAp7QuSBNzCeH4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1JYtzm5Xt3uGxXdOwg30qeZ_F0x7lSMwRDJAgKTAi_Gc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -30114,102 +30349,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 9728 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 9728 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9728 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -30217,7 +30452,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -30226,38 +30461,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 126 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB7_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 32 ThreadTileB: 2 - TransposeLDS: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -30270,42 +30505,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30313,37 +30549,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1d95FnwSepCpQ1VNN-c9bIuWmSq8NCHH4Yv9ZeAeUB5U= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ljCbO7E0xnyQwh63y6jBAf9qB84fAz-ikx5KuEYBkO4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -30353,37 +30589,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 + LVCA: 32 + LVCB: 32 LVPA: 2 LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 8192 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 8192 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4096 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 12288 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -30391,64 +30627,64 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 2] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 6 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 64 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -30465,31 +30701,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 127 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA5_NTB1_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 32 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -30506,7 +30742,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -30516,35 +30752,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30552,12 +30789,11 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1vdET2FEVxtnfxZJASjE4Mpu0qbCqFDyzkii6x1dbHPs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -30566,63 +30802,63 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 64 - LSPA: 4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 128 + LSPA: 16 LSPB: 8 - LVCA: 32 - LVCB: 16 + LVCA: 16 + LVCB: 32 LVPA: 4 LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28672 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 28672 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -30632,35 +30868,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -30672,30 +30908,30 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -30704,31 +30940,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 128 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 48 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 48 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true @@ -30745,36 +30981,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -30782,8 +31018,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30791,12 +31028,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1XVsYbgnD3R965KnpJIJjvIue0CI2SvPxRYsIPhAomzo= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xO78SfH8mQcoIMGAjMin7gcWpXOlIKbkUU6oKtuOViE= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -30805,65 +31042,65 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB6_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12800 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 12800 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 12800 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 LdsPadA: 0 - LdsPadB: 16 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -30871,35 +31108,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -30910,22 +31147,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 7 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -30943,18 +31180,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 129 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA1_NTB6_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -30965,10 +31202,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -30987,42 +31224,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31030,12 +31268,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xk1pA2wRYcqyCR46mTdGjDlhdTU15it1Jh10auM8T_po= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT95yVDjp7dKdvEwAelX8U-m0LR7_oTUhNUc468WYqSMis= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -31044,63 +31282,63 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA5_NTB0_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 4096 + LSCA: 32 + LSCB: 128 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 40960 + LdsBytesNoAmax: 126976 LdsInitCVgprs: false - LdsNumBytes: 40960 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 49152 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 73728 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 77824 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 40960 - LdsOffsetMetadata_Blk: 73728 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 77824 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -31110,11 +31348,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -31122,23 +31360,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -31149,22 +31387,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 7 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -31172,8 +31410,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -31182,32 +31420,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 130 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB4096_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA5_NTB0_NTC7_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 48 + ThreadTile1: 3 + ThreadTileA: 48 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -31223,34 +31461,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -31258,10 +31496,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31269,7 +31508,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xWktQ7pkAc4eXErbVdZeJISaskwk40DfNVmN4lX1TIVk= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vlnE_4luus1zCUHcypOmdrR_sbwtiIA8OS0tI-q8Ru0= BufferLoad: true BufferStore: true CUCount: null @@ -31280,27 +31519,27 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -31309,37 +31548,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB4_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 64 LSCB: 128 - LSPA: 8 - LSPB: 4 + LSPA: 16 + LSPB: 8 LVCA: 16 LVCB: 32 - LVPA: 2 - LVPB: 1 + LVPA: 4 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 0 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 40960 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 8192 + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -31347,12 +31586,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -31360,14 +31599,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 192 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 192 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -31382,29 +31621,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 4 - NonTemporalC: 7 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -31421,36 +31660,36 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 131 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA4_NTB4_NTC7_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM6_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 2 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 64 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 48 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 48 ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: 0 UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true @@ -31462,45 +31701,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31508,7 +31748,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16x_DAw4csScAbaMhQdK6dmv9jQOzE5sE-cLR2YRUIrRvU= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1SDfMH-27Y7mEuNmRwG8kvOzYZofhtz1GHOgEQ1JLt60= BufferLoad: true BufferStore: true CUCount: null @@ -31522,23 +31762,23 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -31548,37 +31788,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 LDSTrInst: 1 LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCB: 32 + LSPA: 64 LSPB: 32 LVCA: 4 - LVCB: 4 - LVPA: 8 + LVCB: 8 + LVPA: 16 LVPB: 8 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32768 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 32768 + LdsNumBytes: 57344 LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 + LdsOffsetA_Blk: 32768 LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -31591,7 +31831,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -31599,15 +31839,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [1, 2] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 16 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 16 - MacroTileB: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -31628,24 +31868,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 + NonTemporalB: 4 NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 + NumElementsPerBatchStore: 8 NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 - NumThreads: 128 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -31662,25 +31900,25 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 132 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM0_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] @@ -31701,12 +31939,9 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -31716,24 +31951,24 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -31741,10 +31976,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31752,7 +31988,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x48x64_MI16x1MqDWekebqrvXnO-Sb3hzu3z0gjkhgQRF_inqkjHmDjg= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3kn1MecExrukXO0aoF0LQyTOFFiyvmj1lAoIdfg5BGCM= BufferLoad: true BufferStore: true CUCount: null @@ -31762,27 +31998,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -31792,50 +32028,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 LVCA: 8 - LVCB: 4 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 22528 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 22528 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 22528 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -31844,14 +32080,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 1] - MIWaveTile: [2, 3] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 48 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 48 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -31865,38 +32101,36 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 + NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 8 - NumLoadsB: 12 + NumLoadsB: 8 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 64 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -31906,18 +32140,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 133 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB3_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM24_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -31929,15 +32163,15 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 3 + ThreadTile1: 2 ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTileB: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -31945,50 +32179,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31996,90 +32228,90 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x112x32_MI16EvzYM6tpm_TvnED2_SgqZlUy-UXsf-lBMnZ88QQbEGY= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3FvQPs5Yo_G_QmVYzG6yS8nTnH_iGTUQcTqM20yP63Jg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x112x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1792_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS14_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 0 - LSCA: 128 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 LVPB: 16 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1792 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 31232 + LdsBytesNoAmax: 114688 LdsInitCVgprs: false - LdsNumBytes: 31232 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 14848 + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 31232 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 16 + LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -32087,15 +32319,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 7] + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 7 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 112 - MacroTileA: 128 - MacroTileB: 112 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -32109,31 +32341,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 56 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 4 - NumLoadsB: 14 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 7 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -32141,7 +32371,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -32150,32 +32380,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 134 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x112x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1792_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS14_NLCA1_NLCB7_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 7 + ThreadTile1: 1 ThreadTileA: 8 - ThreadTileB: 7 + ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32189,9 +32419,6 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 @@ -32204,35 +32431,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32240,7 +32468,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32FmIQnqRk09hQ4dEBba9m-GH9ijUUN9SUEMw6YgNDXO4= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qHETGII6d0uSQZoEXMbPslbRwZtZ2qEIdMes7tMaKUM= BufferLoad: true BufferStore: true CUCount: null @@ -32251,26 +32479,26 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -32280,27 +32508,27 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 128 - LSCB: 128 + LSCB: 16 LSPA: 8 - LSPB: 8 + LSPB: 64 LVCA: 32 - LVCB: 32 + LVCB: 4 LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVPB: 16 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 768 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 131072 + LdsBytesNoAmax: 46080 LdsInitCVgprs: false - LdsNumBytes: 131072 + LdsNumBytes: 46080 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedB: 13312 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -32309,51 +32537,51 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 + LdsOffsetMetadata: 46080 LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 16 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 48 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -32362,29 +32590,27 @@ NonTemporalA: 0 NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 3 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 + NumLoadsCoalescedB: 3 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 1 NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 8 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -32394,32 +32620,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 135 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32433,50 +32659,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 4 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32484,7 +32708,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI32xgmSt2MoRmL0g9WeUHU2DTu1EingvlwMUOcXgdGK6Qso= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT4fXWWSVAh5Sx3ln-dukIdf9DQU0-ZZbM_XZiPevYyhHg= BufferLoad: true BufferStore: true CUCount: null @@ -32494,110 +32718,110 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 111616 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 111616 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 78848 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 + LdsOffsetMetadata: 13312 + LdsOffsetMetadata_Blk: 78848 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -32606,22 +32830,20 @@ NonTemporalA: 4 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -32638,32 +32860,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 136 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM48_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32677,50 +32899,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32728,7 +32948,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI32xcj81g2ldg0_1p4EGvdXPUPXmYszqzbqqiK6W7fsyN7o= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT30Yb2FxBYbeBYzDmCEQ-2xgTMHV7OuGTt-uS6NPWiiK4= BufferLoad: true BufferStore: true CUCount: null @@ -32739,26 +32959,26 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -32768,80 +32988,80 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 LDSTrInst: 0 - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 32768 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [1, 2] MIWaveTile: [2, 1] MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -32854,18 +33074,16 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -32882,31 +33100,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 137 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM16_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 32 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 0 TransposeLDSMetadata: true @@ -32921,9 +33139,6 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 @@ -32936,35 +33151,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 128 _DepthUA: 128 _DepthUB: 128 _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32972,7 +33188,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3h_261KD0u2Vj7WNjoAsXe6crAZkIpfL16hCc4AFHrww= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VoHL4lZ09fFgLKb3kjhbcWCkAbytz4WOa5AXsyzVOTg= BufferLoad: true BufferStore: true CUCount: null @@ -32984,26 +33200,26 @@ DebugStreamK: 0 DepthU: 64 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -33012,37 +33228,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 + LVCB: 32 LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVPB: 2 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 163840 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16384 + LdsNumBytes: 163840 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 32768 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 81920 + LdsOffsetB: 49152 + LdsOffsetB_Blk: 131072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 131072 LdsPadA: 0 LdsPadB: 0 LdsPadMetadata: 0 @@ -33050,13 +33266,13 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -33064,52 +33280,50 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 4 NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 12 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -33126,32 +33340,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 138 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -33165,50 +33379,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -33216,7 +33428,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3NfWsJl31L67XB35DT_qzFYelPH2nanMC9vO0xyiABZs= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT26xQYo7HSoTsdd4K1SIwcui68F8QKoN7vKJthhD9GJjQ= BufferLoad: true BufferStore: true CUCount: null @@ -33228,25 +33440,25 @@ DebugStreamK: 0 DepthU: 64 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -33256,51 +33468,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 LSCB: 64 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 3584 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65536 + LdsBytesNoAmax: 149504 LdsInitCVgprs: false - LdsNumBytes: 65536 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 149504 + LdsNumElementsAlignedA: 58368 LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 74752 + LdsOffsetB: 58368 + LdsOffsetB_Blk: 133120 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 + LdsOffsetMetadata: 58368 + LdsOffsetMetadata_Blk: 133120 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -33308,52 +33520,50 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 224 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 224 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 4 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 14 NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 7 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -33370,32 +33580,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 139 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 + StoreSwapAddr: true + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -33409,50 +33619,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -33460,7 +33668,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI32xagL1cGCgYkLevIGoGnEP5DIN0wWTgYBlqRG_U06ddkE= + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT220m3qKLTrCkhh9vPv_Zxw_hfLycTgfLtMkglSrwNBmU= BufferLoad: true BufferStore: true CUCount: null @@ -33472,26 +33680,26 @@ DebugStreamK: 0 DepthU: 64 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: false @@ -33500,80 +33708,80 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 3584 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114688 + LdsBytesNoAmax: 149504 LdsInitCVgprs: false - LdsNumBytes: 114688 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 32768 + LdsNumBytes: 149504 + LdsNumElementsAlignedA: 58368 + LdsNumElementsAlignedB: 16384 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 74752 + LdsOffsetB: 58368 + LdsOffsetB_Blk: 133120 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 + LdsOffsetMetadata: 58368 + LdsOffsetMetadata_Blk: 133120 + LdsPadA: 16 LdsPadB: 0 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 224 + MacroTile1: 64 + MacroTileA: 224 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -33582,29 +33790,27 @@ NonTemporalA: 4 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 7 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 8 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -33614,32 +33820,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 140 - SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -33653,329 +33859,326 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 2 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - [2, 3, 0, 1] - - - [120, 256, 1, 8192] - - [44, 0.0] + - [133, 15.85] - - [128, 256, 1, 8192] - [0, 0.0] - - [128, 2440, 1, 8192] - - [46, 0.0] + - [120, 54.86] - - [128, 5120, 1, 8192] - - [135, 158872.0] + - [121, 68.21] - - [128, 5640, 1, 8192] - - [1, 0.0] + - [122, 70.77] - - [256, 120, 1, 8192] - - [136, 35579.9] + - [87, 35579.9] - - [256, 256, 1, 8192] - - [48, 0.0] + - [30, 0.0] - - [256, 512, 1, 8192] - - [2, 0.0] + - [1, 0.0] - - [256, 528, 1, 8192] - - [49, 0.0] + - [135, 37.1] - - [256, 2048, 1, 8192] - - [3, 0.0] + - [123, 72.57] - - [512, 120, 1, 8192] - - [137, 54795.5] + - [88, 54795.5] - - [512, 128, 1, 8192] - - [4, 0.0] + - [2, 0.0] - - [512, 256, 1, 8192] - - [5, 0.0] + - [3, 0.0] - - [512, 512, 1, 8192] - - [6, 0.0] + - [4, 0.0] - - [512, 528, 1, 8192] - - [50, 0.0] + - [31, 0.0] - - [512, 1980, 1, 8192] - - [51, 0.0] + - [32, 0.0] - - [512, 2048, 1, 8192] - - [7, 0.0] + - [5, 0.0] - - [528, 256, 1, 8192] - - [52, 0.0] + - [136, 36.39] - - [1024, 512, 1, 8192] - - [53, 0.0] + - [33, 0.0] - - [1980, 512, 1, 8192] - - [54, 0.0] + - [34, 0.0] - - [2048, 512, 1, 8192] - - [8, 0.0] + - [6, 0.0] - - [2820, 5640, 1, 8192] - - [9, 0.0] + - [7, 0.0] - - [3072, 512, 1, 8192] - - [56, 0.0] + - [36, 0.0] - - [3960, 512, 1, 8192] - - [57, 0.0] + - [37, 0.0] - - [4352, 128, 1, 8192] - - [10, 0.0] + - [126, 65.33] - - [4352, 256, 1, 8192] - - [58, 0.0] + - [38, 0.0] - - [4608, 256, 1, 8192] - - [59, 0.0] + - [39, 0.0] - - [5120, 128, 1, 8192] - - [11, 0.0] + - [127, 66.51] - - [5640, 128, 1, 8192] - - [12, 0.0] + - [128, 68.8] - - [5640, 2820, 1, 8192] - - [13, 0.0] + - [8, 0.0] - - [6912, 128, 1, 8192] - - [14, 0.0] + - [129, 71.97] - - [7296, 128, 1, 8192] - - [60, 0.0] + - [40, 0.0] - - [10880, 128, 1, 8192] - - [15, 0.0] + - [9, 0.0] - - [4, 128, 8192, 30] - - [16, 0.0] + - [10, 0.0] - - [16, 128, 8192, 33] - - [17, 0.0] + - [117, 6.95] - - [40, 128, 8192, 64] - - [18, 0.0] + - [118, 17.71] - - [128, 128, 1, 17711] - - [81, 0.0] + - [54, 0.0] - - [128, 960, 1, 17711] - - [83, 0.0] + - [125, 52.82] - - [128, 2480, 1, 17711] - - [84, 0.0] + - [124, 67.94] - - [128, 17711, 1, 41] - - [86, 0.0] + - [96, 10.05] - - [384, 17711, 1, 246] - - [109, 0.0] + - [93, 58.97] - - [384, 17711, 1, 768] - - [19, 0.0] + - [94, 80.41] - - [768, 96, 1, 17711] - - [114, 0.0] + - [72, 0.0] - - [887, 256, 1, 17711] - - [115, 0.0] + - [73, 0.0] - - [928, 128, 1, 17711] - - [116, 0.0] + - [131, 50.17] - - [2732, 384, 1, 17711] - - [122, 0.0] + - [130, 93.64] - - [28, 256, 1, 4096] - - [61, 0.0] + - [116, 2.75] - - [28, 320, 1, 4096] - - [62, 0.0] + - [112, 3.3] - - [64, 512, 1, 4096] - - [69, 0.0] + - [110, 10.53] - - [72, 256, 1, 4096] - - [71, 0.0] + - [46, 0.0] - - [72, 320, 1, 4096] - - [72, 0.0] + - [47, 0.0] - - [80, 512, 1, 4096] - - [73, 0.0] + - [105, 12.45] - - [96, 512, 1, 4096] - - [76, 0.0] + - [107, 14.97] - - [116, 256, 1, 4096] - - [77, 0.0] + - [50, 0.0] - - [116, 320, 1, 4096] - - [78, 0.0] + - [51, 0.0] - - [128, 2048, 1, 4096] - - [20, 0.0] + - [97, 46.7] - - [160, 512, 1, 4096] - - [21, 0.0] + - [115, 20.26] - - [180, 256, 1, 4096] - - [89, 0.0] + - [58, 0.0] - - [180, 320, 1, 4096] - - [90, 0.0] + - [59, 0.0] - - [256, 28, 1, 4096] - - [97, 0.0] + - [109, 2.75] - - [256, 72, 1, 4096] - - [98, 0.0] + - [106, 6.24] - - [256, 116, 1, 4096] - - [99, 0.0] + - [113, 9.6] - - [256, 180, 1, 4096] - - [103, 0.0] + - [108, 13.36] - - [256, 256, 1, 4096] - - [105, 0.0] + - [111, 18.23] - - [256, 7680, 1, 4096] - - [106, 0.0] + - [67, 0.0] - - [512, 160, 1, 4096] - - [111, 0.0] + - [114, 19.51] - - [512, 512, 1, 4096] - - [22, 0.0] + - [11, 0.0] - - [512, 2246, 1, 4096] - - [112, 0.0] + - [70, 0.0] - - [1600, 128, 1, 4096] - - [118, 0.0] + - [101, 36.46] - - [1824, 2048, 1, 4096] - - [23, 0.0] + - [12, 0.0] - - [2048, 57, 1, 4096] - - [120, 0.0] + - [75, 0.0] - - [2048, 64, 1, 4096] - - [121, 0.0] + - [76, 0.0] - - [2048, 82, 1, 4096] - - [24, 0.0] + - [103, 28.4] - - [2048, 160, 1, 4096] - - [25, 0.0] + - [102, 44.82] - - [2048, 2048, 1, 4096] - - [26, 0.0] + - [13, 0.0] - - [2246, 512, 1, 4096] - - [27, 0.0] + - [14, 0.0] - - [4132, 256, 1, 4096] - - [124, 0.0] + - [77, 0.0] - - [4132, 512, 1, 4096] - - [125, 0.0] + - [78, 0.0] - - [7680, 256, 1, 4096] - - [28, 0.0] + - [15, 0.0] - - [7680, 512, 1, 4096] - - [29, 0.0] + - [16, 0.0] - - [28, 32, 8192, 28] - - [126, 0.0] + - [79, 0.0] - - [32, 25, 8192, 25] - - [127, 0.0] + - [80, 0.0] - - [32, 64, 4096, 57] - - [128, 0.0] + - [81, 0.0] - - [32, 64, 4096, 82] - - [129, 0.0] + - [82, 0.0] - - [48, 160, 4096, 192] - - [30, 0.0] + - [17, 0.0] - - [48, 160, 4096, 642] - - [31, 0.0] + - [18, 0.0] - - [64, 200, 4096, 32] - - [130, 0.0] + - [83, 0.0] - - [160, 64, 96, 4096] - - [32, 0.0] + - [19, 0.0] - - [200, 64, 4096, 32] - - [33, 0.0] + - [119, 14.33] - - [8, 256, 1, 2048] - - [34, 0.0] + - [20, 0.0] - - [16, 256, 1, 2048] - - [35, 0.0] + - [21, 0.0] - - [32, 256, 1, 2048] - - [36, 0.0] + - [22, 0.0] - - [36, 256, 1, 2048] - - [37, 0.0] + - [23, 0.0] - - [40, 256, 1, 2048] - - [38, 0.0] + - [24, 0.0] - - [48, 256, 1, 2048] - - [39, 0.0] + - [25, 0.0] - - [64, 256, 1, 2048] - - [40, 0.0] + - [26, 0.0] - - [72, 256, 1, 2048] - - [41, 0.0] + - [27, 0.0] - - [80, 256, 1, 2048] - - [42, 0.0] + - [28, 0.0] - - [96, 256, 1, 2048] - - [132, 9584.86] + - [85, 9584.86] - - [128, 256, 1, 2048] - - [82, 0.0] + - [55, 0.0] - - [256, 128, 1, 2048] - - [101, 0.0] + - [64, 0.0] - - [256, 256, 1, 2048] - - [104, 0.0] + - [66, 0.0] - - [64, 128, 1, 8192] - - [43, 0.0] + - [132, 5.16] - - [128, 128, 1, 8192] - - [45, 0.0] + - [134, 9.41] - - [256, 128, 1, 98304] - - [47, 0.0] + - [29, 0.0] - - [1980, 1024, 1, 8192] - - [55, 0.0] + - [35, 0.0] - - [57, 32, 1, 262144] - - [63, 0.0] + - [137, 14.94] - - [64, 64, 1, 102400] - - [64, 0.0] + - [41, 0.0] - - [64, 64, 1, 131072] - - [65, 0.0] + - [42, 0.0] - - [64, 64, 1, 819200] - - [66, 0.0] + - [43, 0.0] - - [64, 128, 1, 1024] - - [67, 0.0] + - [104, 1.02] - - [64, 128, 1, 131072] - - [68, 0.0] + - [44, 0.0] - - [72, 128, 1, 1024] - - [70, 0.0] + - [45, 0.0] - - [82, 32, 1, 262144] - - [74, 0.0] + - [48, 0.0] - - [96, 128, 1, 1024] - - [75, 0.0] + - [49, 0.0] - - [128, 64, 1, 131072] - - [79, 0.0] + - [52, 0.0] - - [128, 128, 1, 1024] - - [80, 0.0] + - [53, 0.0] - - [128, 4096, 1, 1024] - - [85, 0.0] + - [90, 36.1] - - [128, 7456, 1, 1024] - - [138, 97916.7] + - [89, 49.79] - - [144, 128, 1, 1024] - - [87, 0.0] + - [56, 0.0] - - [160, 10, 1, 655360] - - [88, 0.0] + - [57, 0.0] - - [192, 48, 1, 655360] - - [91, 0.0] + - [60, 0.0] - - [192, 112, 1, 655360] - - [92, 0.0] + - [138, 61.51] - - [224, 64, 1, 527553] - - [93, 0.0] + - [139, 49.28] - - [224, 64, 1, 752863] - - [94, 0.0] + - [140, 52.03] - - [233, 56, 1, 131072] - - [95, 0.0] + - [61, 0.0] - - [252, 128, 1, 17711] - - [96, 0.0] + - [62, 0.0] - - [256, 128, 1, 1024] - - [100, 0.0] + - [63, 0.0] - - [256, 128, 1, 17711] - - [102, 0.0] + - [65, 0.0] - - [256, 7968, 1, 1024] - - [107, 0.0] + - [92, 65.32] - - [288, 64, 1, 806154] - - [108, 0.0] + - [68, 0.0] - - [512, 128, 1, 1024] - - [110, 0.0] + - [69, 0.0] - - [512, 2011, 1, 1024] - - [139, 100794.0] + - [91, 52.72] - - [642, 304, 1, 655360] - - [113, 0.0] + - [71, 0.0] - - [1024, 128, 1, 2048] - - [117, 0.0] + - [74, 0.0] - - [2011, 512, 1, 1024] - - [119, 0.0] + - [100, 50.39] - - [4096, 128, 1, 1024] - - [123, 0.0] + - [99, 32.27] - - [20, 48, 17711, 124] - - [133, 30249.3] + - [86, 30249.3] - - [128, 128, 6, 17711] - - [131, 0.0] + - [84, 0.0] - - [128, 17711, 6, 128] - - [134, 90258.2] + - [95, 49.6] - - [7968, 256, 1, 1024] - - [140, 135783.0] + - [98, 64.89] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml new file mode 100644 index 00000000000..06e33565602 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bjlk_S_MX_B_UserArgs.yaml @@ -0,0 +1,12714 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx950 +- gfx950 +- [Device 0058] +- Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DataTypeA: 0 + DataTypeAmaxD: 0 + DataTypeB: 0 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 10 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 0 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 0 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT68GNy3DOANu_i2VXAB1Y2BeloAMEwWh68Ero33soSvMM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6wJcIFiIXcL9xRCcdjHOdxHUlrFCn9hAkB7Ii07m5Oeo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6MFyMQpdiVOcqKVaa7dbJIhJPdEmQ0U9Tqd7zT-vG3WY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT13u0IoyZA0H6JeQsxYx5dKpo7nQ9hoyYgsEHeT0aCVTI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3nYUFHmrSHNp-n-T71OSxpo2-QMWXfro-CzcYssiDH2w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 12288 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 49152 + LdsOffsetB_Blk: 114688 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 114688 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 384 + MacroTile1: 96 + MacroTileA: 384 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 12 + NumLoadsB: 3 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT384x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 3 + ThreadTileA: 48 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VJH_RBajY8L_kN5xvMaq7RQvJq3SsD2MoUk710Se8-I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x448x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 73728 + LdsInitCVgprs: false + LdsNumBytes: 73728 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 57344 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 147456 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 73728 + LdsOffsetMetadata_Blk: 147456 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 448 + MacroTileA: 128 + MacroTileB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 4 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 7 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x448x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB7_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6rqfD3ghhAs2e9z1zuI_aBgcgETHE2YH-hIjPMwl5H1A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57344 + LdsInitCVgprs: false + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6M8k0zgCr79GezsF4koVKCwmzswSUcG00QTbkQGZOzc4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49152 + LdsInitCVgprs: false + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x32x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x96x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 98304 + LdsInitCVgprs: false + LdsNumBytes: 98304 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 163840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 81920 + LdsOffsetMetadata_Blk: 163840 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x96x128_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 768 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59392 + LdsInitCVgprs: false + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 26624 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 59392 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT64x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Yk7kOAJB_aE6VWqrVWf__KLZBXdHS4G4A_wz_1sHzCU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3to8cjWEs6y4SguQB5Eo9GwEWnTleft3p6-QhBINReWU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3eZ5RqKxB1O3236AeikNxnkmOhdKPC4D_VFkvtbj1K28= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WumQmrRZO9w1nG8nVI-z2VGL_a-D9ZwkiDz46zvf_bQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT348doMK9QEbWRou0upuvJNPJpi_evI_jhEzrHZU7JHfw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53248 + LdsInitCVgprs: false + LdsNumBytes: 53248 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 20480 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 53248 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zjgFvuyb2NeEcS1wEFnStawG246sqNqAes5-pKzM548= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3-LSPS0cmSEhfqGL2SuyEC3Fz2PkCkvfzxBANwy6XxN4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3JLacchZPwjeBP68mM8x4ppN98WdtQKDcx7Xp-zqDpyY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 0 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1F29g57gDKx143xeF4Ry8TpDfmB9904AahmpgslE7iFA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8192 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Va3I0gXvAjl6WSNm7VfyQxdSiay22E7f07A2gXBFahE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37888 + LdsInitCVgprs: false + LdsNumBytes: 37888 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 70656 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37888 + LdsOffsetMetadata_Blk: 70656 + LdsPadA: 16 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 128 + MacroTileA: 16 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 46080 + LdsInitCVgprs: false + LdsNumBytes: 46080 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 78848 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 78848 + LdsPadA: 16 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Pg_BkawruRCL1XGe64CaI5c_MySTZSQHvn2xslbYUME= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49152 + LdsInitCVgprs: false + LdsNumBytes: 49152 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kiD3-1VtWxVcsUpwA9y_XMbufpwlDdmtaKQ75wO2y28= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1yMCAGzp9nwTbEima2Un3PvZ0HXHBQIMvg1dFgzYWa_Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qduvuzsXPPzFHa_P72l7BR-VshFrlFZLdSkdePVgfp4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1LmAvRmupSNgrSah59TXAt88n3c4wrKF6OZh9XiWLgCg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1npxiDxAnSPzJZrvFG5oOPfnm5vME6tIZYSSv4aPUpfk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vIn45BtBGJJHgOeHh-5A8gVBnOI_bnqOc5L-OKq7gYA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24576 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1JYtzm5Xt3uGxXdOwg30qeZ_F0x7lSMwRDJAgKTAi_Gc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ljCbO7E0xnyQwh63y6jBAf9qB84fAz-ikx5KuEYBkO4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xO78SfH8mQcoIMGAjMin7gcWpXOlIKbkUU6oKtuOViE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT95yVDjp7dKdvEwAelX8U-m0LR7_oTUhNUc468WYqSMis= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 128 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 126976 + LdsInitCVgprs: false + LdsNumBytes: 126976 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 49152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 77824 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 77824 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 384 + MacroTileA: 96 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 3 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT96x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 3 + ThreadTileA: 48 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vlnE_4luus1zCUHcypOmdrR_sbwtiIA8OS0tI-q8Ru0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 40960 + LdsInitCVgprs: false + LdsNumBytes: 40960 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 40960 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 2 + ThreadTileA: 48 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1SDfMH-27Y7mEuNmRwG8kvOzYZofhtz1GHOgEQ1JLt60= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 + LSCA: 16 + LSCB: 32 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57344 + LdsInitCVgprs: false + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3kn1MecExrukXO0aoF0LQyTOFFiyvmj1lAoIdfg5BGCM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3FvQPs5Yo_G_QmVYzG6yS8nTnH_iGTUQcTqM20yP63Jg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 32 + LSCB: 16 + LSPA: 32 + LSPB: 64 + LVCA: 8 + LVCB: 4 + LVPA: 8 + LVPB: 16 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114688 + LdsInitCVgprs: false + LdsNumBytes: 114688 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1qHETGII6d0uSQZoEXMbPslbRwZtZ2qEIdMes7tMaKUM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 768 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 46080 + LdsInitCVgprs: false + LdsNumBytes: 46080 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 13312 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 46080 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 48 + MacroTileA: 128 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT128x48x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB768_LBSPPM0_LPA0_LPB16_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB3_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT4fXWWSVAh5Sx3ln-dukIdf9DQU0-ZZbM_XZiPevYyhHg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 111616 + LdsInitCVgprs: false + LdsNumBytes: 111616 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 78848 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 13312 + LdsOffsetMetadata_Blk: 78848 + LdsPadA: 16 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT48x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB2048_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT30Yb2FxBYbeBYzDmCEQ-2xgTMHV7OuGTt-uS6NPWiiK4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VoHL4lZ09fFgLKb3kjhbcWCkAbytz4WOa5AXsyzVOTg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 163840 + LdsInitCVgprs: false + LdsNumBytes: 163840 + LdsNumElementsAlignedA: 49152 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 81920 + LdsOffsetB: 49152 + LdsOffsetB_Blk: 131072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49152 + LdsOffsetMetadata_Blk: 131072 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 4] + MIWaveTileA: 6 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 128 + MacroTileA: 192 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 12 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT192x128x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB2048_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 4 + ThreadTileA: 24 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT26xQYo7HSoTsdd4K1SIwcui68F8QKoN7vKJthhD9GJjQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 3584 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 149504 + LdsInitCVgprs: false + LdsNumBytes: 149504 + LdsNumElementsAlignedA: 58368 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74752 + LdsOffsetB: 58368 + LdsOffsetB_Blk: 133120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 58368 + LdsOffsetMetadata_Blk: 133120 + LdsPadA: 16 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 64 + MacroTileA: 224 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 7 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT220m3qKLTrCkhh9vPv_Zxw_hfLycTgfLtMkglSrwNBmU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 3584 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 149504 + LdsInitCVgprs: false + LdsNumBytes: 149504 + LdsNumElementsAlignedA: 58368 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74752 + LdsOffsetB: 58368 + LdsOffsetB_Blk: 133120 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 58368 + LdsOffsetMetadata_Blk: 133120 + LdsPadA: 16 + LdsPadB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 2] + MIWaveTileA: 7 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 64 + MacroTileA: 224 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 56 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 14 + NumLoadsB: 4 + NumLoadsCoalescedA: 7 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_S_MX_B_UserArgs_MT224x64x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3584_LBSPPB1024_LBSPPM0_LPA16_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_2_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 2 + ThreadTileA: 28 + ThreadTileB: 2 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 7456, 1, 1024] + - [0, 49.79] + - - [128, 4096, 1, 1024] + - [1, 36.1] + - - [512, 2011, 1, 1024] + - [2, 52.72] + - - [256, 7968, 1, 1024] + - [3, 65.32] + - - [384, 17711, 1, 246] + - [4, 58.97] + - - [384, 17711, 1, 768] + - [5, 80.41] + - - [128, 17711, 6, 128] + - [6, 49.6] + - - [128, 17711, 1, 41] + - [7, 10.05] + - - [128, 2048, 1, 4096] + - [8, 46.7] + - - [7968, 256, 1, 1024] + - [9, 64.89] + - - [4096, 128, 1, 1024] + - [10, 32.27] + - - [2011, 512, 1, 1024] + - [11, 50.39] + - - [1600, 128, 1, 4096] + - [12, 36.46] + - - [2048, 160, 1, 4096] + - [13, 44.82] + - - [2048, 82, 1, 4096] + - [14, 28.4] + - - [64, 128, 1, 1024] + - [15, 1.02] + - - [80, 512, 1, 4096] + - [16, 12.45] + - - [256, 72, 1, 4096] + - [17, 6.24] + - - [96, 512, 1, 4096] + - [18, 14.97] + - - [256, 180, 1, 4096] + - [19, 13.36] + - - [256, 28, 1, 4096] + - [20, 2.75] + - - [64, 512, 1, 4096] + - [21, 10.53] + - - [256, 256, 1, 4096] + - [22, 18.23] + - - [28, 320, 1, 4096] + - [23, 3.3] + - - [256, 116, 1, 4096] + - [24, 9.6] + - - [512, 160, 1, 4096] + - [25, 19.51] + - - [160, 512, 1, 4096] + - [26, 20.26] + - - [28, 256, 1, 4096] + - [27, 2.75] + - - [16, 128, 8192, 33] + - [28, 6.95] + - - [40, 128, 8192, 64] + - [29, 17.71] + - - [200, 64, 4096, 32] + - [30, 14.33] + - - [128, 2440, 1, 8192] + - [31, 54.86] + - - [128, 5120, 1, 8192] + - [32, 68.21] + - - [128, 5640, 1, 8192] + - [33, 70.77] + - - [256, 2048, 1, 8192] + - [34, 72.57] + - - [128, 2480, 1, 17711] + - [35, 67.94] + - - [128, 960, 1, 17711] + - [36, 52.82] + - - [4352, 128, 1, 8192] + - [37, 65.33] + - - [5120, 128, 1, 8192] + - [38, 66.51] + - - [5640, 128, 1, 8192] + - [39, 68.8] + - - [6912, 128, 1, 8192] + - [40, 71.97] + - - [2732, 384, 1, 17711] + - [41, 93.64] + - - [928, 128, 1, 17711] + - [42, 50.17] + - - [64, 128, 1, 8192] + - [43, 5.16] + - - [120, 256, 1, 8192] + - [44, 15.85] + - - [128, 128, 1, 8192] + - [45, 9.41] + - - [256, 528, 1, 8192] + - [46, 37.1] + - - [528, 256, 1, 8192] + - [47, 36.39] + - - [57, 32, 1, 262144] + - [48, 14.94] + - - [192, 112, 1, 655360] + - [49, 61.51] + - - [224, 64, 1, 527553] + - [50, 49.28] + - - [224, 64, 1, 752863] + - [51, 52.03] +- null +- null +- DeviceEfficiency +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml index 69c681747d5..c44ee7e3e4b 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml @@ -78,10 +78,11 @@ UseScaleAB: '' UseScaleAlphaVec: 1 UseScaleCD: false -- - 1LDSBuffer: 1 +- - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -89,7 +90,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3I0wGX9OCQytvVFG093CjdC3Fh11WJS7aHB6fbVAR3uo= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32i2_U0UMyp0MD6JrbAMSGAM4jMSqovGlng0TkZUPkdbU= BufferLoad: true BufferStore: true CUCount: null @@ -113,13 +114,13 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -131,36 +132,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 16 + LSPA: 4 + LSPB: 32 + LVCA: 64 LVCB: 8 LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 0 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 16384 + LdsBytesNoAmax: 117248 LdsInitCVgprs: false - LdsNumBytes: 16384 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8192 + LdsNumBytes: 117248 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 - LdsPadB: 0 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -179,15 +180,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -207,23 +208,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -240,7 +241,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA4_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 @@ -249,29 +250,29 @@ StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreSyncOpt: 0 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 64 ThreadTile1: 2 - ThreadTileA: 16 + ThreadTileA: 64 ThreadTileB: 2 - TransposeLDS: 0 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -283,16 +284,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -318,6 +319,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -325,7 +327,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x80x64_MI16xRVFK1XBidnBlbpNmd3CrBE33uuI0SAPL3Qyw7W8WneM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x48x32_MI16xkqGY-NF7CAM3EskAJ122WE1zJWH2MocSSQwjfR9WI0Q= BufferLoad: true BufferStore: true CUCount: null @@ -335,7 +337,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -350,7 +352,7 @@ ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -367,34 +369,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB5_NTC2_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB1_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 LSCA: 128 - LSCB: 64 + LSCB: 32 LSPA: 8 LSPB: 16 LVCA: 32 LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 8 LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 55808 + LdsBytesNoAmax: 24064 LdsInitCVgprs: false - LdsNumBytes: 55808 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 23040 + LdsNumBytes: 24064 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 7680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 55808 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 24064 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -404,8 +406,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -416,14 +418,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [2, 5] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 5 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 80 + MacroTile1: 48 MacroTileA: 128 - MacroTileB: 80 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -443,22 +445,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 2 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 5 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -467,7 +469,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -476,21 +478,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA3_NTB5_NTC2_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB1_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -499,9 +501,9 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 5 + ThreadTile1: 3 ThreadTileA: 8 - ThreadTileB: 5 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -532,10 +534,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -544,16 +546,17 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -561,7 +564,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI32i2_U0UMyp0MD6JrbAMSGAM4jMSqovGlng0TkZUPkdbU= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x128x32_MI16xDKRM6tGNXg9sJhCFV85LAtvtZ9sqZSu-_qtnT-ApUoA= BufferLoad: true BufferStore: true CUCount: null @@ -585,15 +588,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -603,36 +606,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 LSCB: 32 - LSPA: 4 + LSPA: 16 LSPB: 32 - LVCA: 64 + LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 16 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117248 + LdsBytesNoAmax: 20992 LdsInitCVgprs: false - LdsNumBytes: 117248 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 20992 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 35328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 20992 + LdsOffsetMetadata_Blk: 35328 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -640,11 +643,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -652,24 +655,24 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 + MIWaveTile: [1, 2] + MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 128 + MacroTileA: 16 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false @@ -679,22 +682,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -703,7 +706,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -712,31 +715,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB1_NTC5_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 64 + ThreadTileA: 4 ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true @@ -755,16 +758,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -777,19 +780,20 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -797,7 +801,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x96x32_MI32x3liFjowZ1wszbsRAx60Qyy1bjIqtt9hLU0sWalgTr5k= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x128x32_MI32xRnvoYb67eaAsgNwXE0Dn8TOG2hIUS2JkKOOoDbvvmi8= BufferLoad: true BufferStore: true CUCount: null @@ -808,9 +812,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -839,34 +843,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA2_NTB1_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61632 + LdsBytesNoAmax: 22528 LdsInitCVgprs: false - LdsNumBytes: 61632 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 12480 + LdsNumBytes: 22528 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 36864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 22528 + LdsOffsetMetadata_Blk: 36864 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -874,8 +878,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false @@ -887,15 +891,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -909,28 +913,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalA: 3 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -948,32 +952,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA2_NTB1_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 3 + ThreadTile1: 1 ThreadTileA: 16 - ThreadTileB: 3 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -998,9 +1002,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -1026,6 +1030,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1033,7 +1038,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x48x32_MI16xkqGY-NF7CAM3EskAJ122WE1zJWH2MocSSQwjfR9WI0Q= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x16x32_MI16x1ZSjm3pNsBWpJBqQ8vtHYNovksGMavhsTPTpgiIsoQJA= BufferLoad: true BufferStore: true CUCount: null @@ -1058,7 +1063,7 @@ ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -1075,34 +1080,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB1_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 8 LSPB: 16 - LVCA: 32 - LVCB: 16 + LVCA: 16 + LVCB: 8 LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 2048 + LVPB: 4 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24064 + LdsBytesNoAmax: 10752 LdsInitCVgprs: false - LdsNumBytes: 24064 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 7680 + LdsNumBytes: 10752 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 2560 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24064 - LdsOffsetMetadata_Blk: 49152 + LdsOffsetMetadata: 10752 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -1115,7 +1120,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -1123,15 +1128,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 3] + MIWaveGroup: [2, 1] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 48 - MacroTileA: 128 - MacroTileB: 48 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -1151,23 +1156,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 6 NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalC: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 3 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -1184,32 +1189,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB1_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 3 + ThreadTile1: 1 ThreadTileA: 8 - ThreadTileB: 3 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -1234,9 +1239,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -1258,10 +1263,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1269,12 +1275,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x128x32_MI16xDKRM6tGNXg9sJhCFV85LAtvtZ9sqZSu-_qtnT-ApUoA= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI32CJXo5F_EnC1qtNw819HS7RB4clh5fpaFNqA3ndpCH4M= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -1293,15 +1299,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -1311,36 +1317,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 + LSCA: 256 LSCB: 32 - LSPA: 16 + LSPA: 4 LSPB: 32 - LVCA: 16 + LVCA: 64 LVCB: 8 - LVPA: 16 + LVPA: 1 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 20992 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 20992 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 35328 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20992 - LdsOffsetMetadata_Blk: 35328 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -1348,35 +1354,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 128 - MacroTileA: 16 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -1387,22 +1393,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalA: 7 + NonTemporalB: 3 + NonTemporalC: 3 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -1411,7 +1417,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -1420,17 +1426,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -1442,10 +1448,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -1463,14 +1469,14 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 @@ -1494,10 +1500,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1505,12 +1512,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x128x32_MI32xRnvoYb67eaAsgNwXE0Dn8TOG2hIUS2JkKOOoDbvvmi8= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16GRSbzQx7ighSVLVcaqdzxzzKPcqoRCQBgH1GHRJJ7XM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -1529,15 +1536,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -1547,36 +1554,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 32 + LSCA: 256 LSCB: 32 - LSPA: 32 + LSPA: 1 LSPB: 32 - LVCA: 8 + LVCA: 256 LVCB: 8 - LVPA: 8 + LVPA: 1 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 22528 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 22528 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 36864 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 22528 - LdsOffsetMetadata_Blk: 36864 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -1584,35 +1591,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -1624,21 +1631,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 4 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -1647,7 +1654,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -1656,38 +1663,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -1699,16 +1706,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 48 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -1721,7 +1728,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -1730,10 +1737,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1741,7 +1749,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x16x32_MI16x1ZSjm3pNsBWpJBqQ8vtHYNovksGMavhsTPTpgiIsoQJA= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI160W_L8MAdvSH6iFJ8r3bD7tGc-qFAzcgrXURTRj3-VzI= BufferLoad: true BufferStore: true CUCount: null @@ -1765,15 +1773,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -1783,35 +1791,35 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 256 LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 16 + LSPA: 1 + LSPB: 32 + LVCA: 256 LVCB: 8 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 10752 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 10752 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 2560 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10752 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -1823,7 +1831,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -1831,15 +1839,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -1859,23 +1867,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 + NonTemporalA: 2 NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -1892,38 +1900,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 7 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -1935,16 +1943,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -1957,7 +1965,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -1970,6 +1978,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1977,7 +1986,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI32CJXo5F_EnC1qtNw819HS7RB4clh5fpaFNqA3ndpCH4M= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x1UDSP19TYSaBbDZT8LdxllcxelGKRB4v4VgMVknXSJ8Q= BufferLoad: true BufferStore: true CUCount: null @@ -1987,10 +1996,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -2001,15 +2010,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -2019,48 +2028,48 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 256 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 1 - LVPB: 8 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -2068,49 +2077,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -2128,38 +2137,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 8 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA7_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 4 - ThreadTileA: 64 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -2171,29 +2180,29 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -2206,6 +2215,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2213,7 +2223,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xXABdh3bk2Jkiez4K4X68a3AY03PVVtvBtc_eW-i0fnQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI322Jt2ItX6xZZLSMKHsF4w0CbdGttb9SNJ6Nhz0_HK7WU= BufferLoad: true BufferStore: true CUCount: null @@ -2223,7 +2233,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -2237,15 +2247,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -2255,72 +2265,72 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB2_NTC2_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 16 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB7_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 18944 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 18944 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 43008 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18944 - LdsOffsetMetadata_Blk: 43008 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -2332,21 +2342,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 0 + NonTemporalB: 7 + NonTemporalC: 0 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -2355,7 +2365,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -2364,38 +2374,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 9 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB2_NTC2_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB7_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -2407,33 +2417,33 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -2442,6 +2452,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2449,20 +2460,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x64_MI16x1jE4wPYzIuURSj0o-J_xbjWAp2HQMeWqPCzVThoOuG2Q= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16wN6A5rnXqv-rHoqnpsJkJScWXV4931PtcBVxS-sQP-A= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -2473,15 +2484,15 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -2491,45 +2502,45 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC2_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 8 - LVCB: 16 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 256 + LSCB: 32 + LSPA: 1 + LSPB: 32 + LVCA: 256 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28800 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 28800 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 4224 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 24576 - LdsPadA: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -2539,15 +2550,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2561,29 +2572,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 2 - NonTemporalD: 4 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 3 + NonTemporalD: 2 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -2591,7 +2602,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -2600,38 +2611,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 10 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC2_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -2643,29 +2654,29 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 @@ -2674,10 +2685,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2685,7 +2697,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1bUjbUEv62p-Fld3AMB-QrB1lplQAKd-eS8s8Ga4rEis= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x128_MI16xHp8Jp1EN-BsW4NNHKXj8wdhUGyxjiO0ZBU36jrCBic4= BufferLoad: true BufferStore: true CUCount: null @@ -2695,7 +2707,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -2708,66 +2720,67 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB4_NTC6_NTD6_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 17408 + LdsBytesNoAmax: 60416 LdsInitCVgprs: false - LdsNumBytes: 17408 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 60416 + LdsNumElementsAlignedA: 10240 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 43008 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetMetadata: 10240 + LdsOffsetMetadata_Blk: 43008 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -2776,14 +2789,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2804,22 +2817,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 4 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -2827,7 +2840,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -2836,21 +2849,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 11 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB4_NTC6_NTD6_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC4_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 4 SubGroup1: 32 SubGroupA: 4 @@ -2858,10 +2871,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -2871,49 +2884,52 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2921,20 +2937,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xpJUyFtapYfNNS6S-_mCC2LXpy2KwwUrl2zn0psf2XY4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI16pW7zOSW31Upy1wP-ePzJYYSpctNFPpPLDnJHKj_nfEw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -2944,14 +2960,15 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -2960,48 +2977,48 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB5_NTC1_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115200 - LdsInitCVgprs: false - LdsNumBytes: 115200 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16896 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 117760 + LdsInitCVgprs: false + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -3011,15 +3028,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3033,28 +3050,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalA: 3 + NonTemporalB: 4 + NonTemporalC: 6 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3063,7 +3080,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3072,7 +3089,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 12 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB5_NTC1_NTD4_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -3081,75 +3098,78 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreSyncOpt: 0 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3157,20 +3177,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1vVMzBO4mWqfB_Kye-esgwFbKcDpne8Tb620dLprWVBs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI16VBp-QCoHOpL6LyENWcMSpXtCN0fwPKVG3tklVSFxaog= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -3180,14 +3200,15 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -3196,48 +3217,48 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57600 + LdsBytesNoAmax: 117760 LdsInitCVgprs: false - LdsNumBytes: 57600 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -3247,15 +3268,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3269,28 +3290,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3299,7 +3320,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3308,68 +3329,69 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 13 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreSyncOpt: 4 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -3378,14 +3400,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3393,12 +3417,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x128x32_MI32x--WTD2tEyvvP6yC7EC_hWmqA4rOL1W_DJqydY3VGIMM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xP3aS4yEsKaJ_WgMEwdb05dsMR2_LceuYOMf1OYI2fWk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -3413,58 +3437,59 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB5_NTC1_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 63488 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 63488 - LdsNumElementsAlignedA: 12288 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 12288 - LdsOffsetB_Blk: 45056 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 12288 - LdsOffsetMetadata_Blk: 45056 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -3472,11 +3497,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -3484,23 +3509,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -3511,22 +3536,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 12 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3534,8 +3559,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3544,42 +3569,43 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 14 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB5_NTC1_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 1 - ThreadTileA: 48 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -3587,16 +3613,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3614,14 +3640,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3629,7 +3657,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x17euTeWN6jMQ7FVMh1Ll2e-mPEhVuyHHM5E6MJDowG_k= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16wI1VAKWPNdP9RHMqPy0EQsFIoAsqd-rof14TfmS3B7w= BufferLoad: true BufferStore: true CUCount: null @@ -3652,53 +3680,54 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_8_MO40_NTn1_NTA2_NTB4_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 32 + LSCA: 128 LSCB: 64 LSPA: 8 LSPB: 16 LVCA: 32 LVCB: 16 - LVPA: 8 + LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -3711,7 +3740,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -3719,15 +3748,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3747,22 +3776,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 + NonTemporalA: 2 + NonTemporalB: 4 NonTemporalC: 7 - NonTemporalD: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3780,32 +3809,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 15 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_8_MO40_NTn1_NTA2_NTB4_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -3815,24 +3844,25 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3841,7 +3871,7 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -3850,14 +3880,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3865,20 +3897,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x19rPAmQ_VqFXH145RBghSm4KBn8pkGE6YhVYK32za2qc= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32UbnKmTyiavN2akJm6VmZtXlrWfhYQJP2E3bwh8fwpA8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -3888,67 +3920,68 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC6_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 LSCB: 64 LSPA: 8 LSPB: 16 LVCA: 32 LVCB: 16 - LVPA: 8 + LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -3956,49 +3989,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 + NonTemporalA: 2 + NonTemporalB: 1 NonTemporalC: 6 - NonTemporalD: 1 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4016,7 +4049,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 16 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB5_NTC6_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -4024,23 +4057,23 @@ StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 32 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 32 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -4051,24 +4084,25 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -4077,23 +4111,25 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4101,20 +4137,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x128_MI16xcL6rbzCmMfe4R9bfmmNjYN2-TCjnsrf0xnm-fnL21cg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32AdmC3-P-DJd8Y5s3WQ6Z37Tt-2n6SRcuWFce7LCusPk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -4124,117 +4160,118 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB7_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 - LDSTrInst: 0 - LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 2048 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 256 + LSCB: 32 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 132096 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 132096 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66048 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98816 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98816 - LdsPadA: 0 - LdsPadB: 8 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 2 + NonTemporalB: 2 + NonTemporalC: 5 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 32 - NumLoadsB: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 16 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4252,84 +4289,87 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 17 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB7_NTC2_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 - TransposeLDS: 1 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4337,7 +4377,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x64x64_MI16x10ZIxmIfFOLgg0guqZLCypFSHKQGFlah6vEK6DSCcuO0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32mu0WQNBr6Bcz0RHbPND4cTi3CAGuaMu12Tv6aUvNdvM= BufferLoad: true BufferStore: true CUCount: null @@ -4347,7 +4387,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -4360,55 +4400,56 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB2_NTC0_NTD1_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 16 - LSCB: 64 - LSPA: 64 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 4 - LdsBlockSizePerPadA: 768 + LSCA: 256 + LSCB: 32 + LSPA: 1 + LSPB: 32 + LVCA: 256 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 31744 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 13312 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 13312 - LdsOffsetB_Blk: 46080 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 31744 - LdsOffsetMetadata_Blk: 46080 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -4417,34 +4458,34 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 64 - MacroTileA: 48 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -4455,21 +4496,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 7 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 32 NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 32 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -4488,7 +4529,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 18 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB2_NTC0_NTD1_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -4497,12 +4538,12 @@ StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -4510,20 +4551,21 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 1 - ThreadTileA: 12 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 64 + ThreadTile1: 2 + ThreadTileA: 64 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -4531,23 +4573,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -4558,14 +4600,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4573,12 +4617,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x32_MI32uyg_KJ6W0X4wFR39DerojqVuONJUcgopuasEX7ENoHM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI327VropPoEGPK4p81XGOPYK7cGqJ9bMn-cBbte0oLvQa0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -4596,54 +4640,55 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB0_NTC7_NTD7_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 256 LSCB: 32 - LSPA: 16 + LSPA: 1 LSPB: 32 - LVCA: 16 + LVCA: 256 LVCB: 8 - LVPA: 4 + LVPA: 1 LVPB: 8 - LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41984 + LdsBytesNoAmax: 116736 LdsInitCVgprs: false - LdsNumBytes: 41984 - LdsNumElementsAlignedA: 24576 + LdsNumBytes: 116736 + LdsNumElementsAlignedA: 33792 LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41984 - LdsOffsetMetadata_Blk: 90112 - LdsPadA: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -4664,13 +4709,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 + MIWaveTile: [4, 2] + MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 192 + MacroTile0: 256 MacroTile1: 128 - MacroTileA: 192 + MacroTileA: 256 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -4691,21 +4736,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 7 + NonTemporalA: 3 + NonTemporalB: 5 + NonTemporalC: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 32 NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 32 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -4724,21 +4769,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 19 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB0_NTC7_NTD7_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 + StoreSyncOpt: 0 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -4746,28 +4791,29 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 + ThreadTile0: 64 ThreadTile1: 2 - ThreadTileA: 48 + ThreadTileA: 64 ThreadTileB: 2 - TransposeLDS: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -4775,8 +4821,8 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMapping: 32 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -4792,16 +4838,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4809,7 +4857,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x32_MI321Jlp9ISKkogci-xniY90jo1KyQxQOK1ONDpQkef74Xc= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x192x64_MI16xjyuL3eEnd6oM60MxKkSoj12f2_Rm9Y651748TEsy1cM= BufferLoad: true BufferStore: true CUCount: null @@ -4819,7 +4867,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -4832,6 +4880,7 @@ ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -4848,39 +4897,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB1_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41984 + LdsBytesNoAmax: 69120 LdsInitCVgprs: false - LdsNumBytes: 41984 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 69120 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 144896 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41984 - LdsOffsetMetadata_Blk: 90112 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 69120 + LdsOffsetMetadata_Blk: 144896 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -4889,34 +4938,34 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [3, 3] MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 + MacroTile0: 48 + MacroTile1: 192 + MacroTileA: 48 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -4927,22 +4976,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 5 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 4 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 12 NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4960,7 +5009,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 20 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB1_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -4969,12 +5018,12 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -4982,20 +5031,21 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 2 - ThreadTileA: 48 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -5004,22 +5054,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -5030,14 +5080,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5045,7 +5097,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x160x32_MI32aSmQIWyuX0F6x-_O4_1rqWaHuuYojs7JtkOyu6uFmvo= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x12XhFbXZExtNclf8zD5hKevb_buILS5setEE10T0PVPQ= BufferLoad: true BufferStore: true CUCount: null @@ -5055,7 +5107,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -5065,12 +5117,13 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -5084,39 +5137,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_5_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 2 + LVCA: 4 + LVCB: 32 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 39424 + LdsBytesNoAmax: 13824 LdsInitCVgprs: false - LdsNumBytes: 39424 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 23040 + LdsNumBytes: 13824 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 39424 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 13824 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -5125,34 +5178,34 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 5] + MIWaveGroup: [1, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 5 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 160 - MacroTileA: 128 - MacroTileB: 160 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -5164,29 +5217,29 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 5 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 5 - NumThreads: 256 + NumLoadsPerpendicularB: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -5196,32 +5249,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 21 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_5_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM32_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5231,33 +5284,34 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -5266,14 +5320,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5281,7 +5337,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xjO4OfkKCkFkgvj91kWXTcGRxZ3AGGPMy8KK_MmH8an4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x32_MI32xRAh7ppQBNzWRsmJpnL4jlXiT-M-iht9O9UTSpXb7f7s= BufferLoad: true BufferStore: true CUCount: null @@ -5304,8 +5360,9 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -5313,44 +5370,44 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA1_NTB0_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 LSCA: 64 LSCB: 32 - LSPA: 16 + LSPA: 4 LSPB: 32 - LVCA: 16 + LVCA: 64 LVCB: 8 LVPA: 4 LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25600 + LdsBytesNoAmax: 35840 LdsInitCVgprs: false - LdsNumBytes: 25600 + LdsNumBytes: 35840 LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedB: 27648 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 + LdsOffsetA_Blk: 65536 LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB_Blk: 73728 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 35840 + LdsOffsetMetadata_Blk: 73728 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -5372,14 +5429,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] + MIWaveTile: [1, 3] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 192 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5399,22 +5456,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 7 + NonTemporalC: 5 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5432,12 +5489,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 22 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA1_NTB0_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -5455,9 +5512,9 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 16 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5467,16 +5524,17 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -5484,7 +5542,7 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -5493,7 +5551,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -5502,14 +5560,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5517,20 +5577,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1-Fpr6LObdZOar2VTNmoL-twRh0ajwoRc5WwTKhLMmRI= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI16xCKUpZ9P25YvYGJ1Acu5q1cb_ng8B3vHgEyiubZYN-vs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -5540,66 +5600,67 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 LSCA: 64 LSCB: 32 - LSPA: 4 + LSPA: 16 LSPB: 32 - LVCA: 64 + LVCA: 16 LVCB: 8 LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49408 + LdsBytesNoAmax: 59904 LdsInitCVgprs: false - LdsNumBytes: 49408 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 41472 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -5607,15 +5668,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5629,28 +5690,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalB: 7 + NonTemporalC: 5 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5668,59 +5729,60 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 23 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 16 ThreadTile1: 2 - ThreadTileA: 8 + ThreadTileA: 16 ThreadTileB: 2 - TransposeLDS: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -5729,7 +5791,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -5738,14 +5800,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5753,7 +5817,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3Nv5gocJwu5_NWJBdSETiqKu7PzRDhU09DdXveu1VomQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xQ9WCQXg3kci4s1k-N1_jON3rjIc87HNykcNj6r53DuQ= BufferLoad: true BufferStore: true CUCount: null @@ -5763,7 +5827,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 256 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -5776,8 +5840,9 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -5785,82 +5850,82 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 1 - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49280 + LdsBytesNoAmax: 98816 LdsInitCVgprs: false - LdsNumBytes: 49280 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8320 + LdsNumBytes: 98816 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -5871,22 +5936,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 1 + NonTemporalC: 3 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5904,8 +5969,8 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 24 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 @@ -5920,15 +5985,15 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -5939,12 +6004,13 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -5954,18 +6020,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -5974,14 +6040,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5989,7 +6057,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x34cnnDEidhRk_WmWhuWyOvEKluncEq7Gtf72_AeBCIx4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3t8AEDJh-qtEP0j1P4XwXYzISUBJp3m20CgLweU5bvQ0= BufferLoad: true BufferStore: true CUCount: null @@ -5999,7 +6067,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -6012,43 +6080,44 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49280 + LdsBytesNoAmax: 49664 LdsInitCVgprs: false - LdsNumBytes: 49280 + LdsNumBytes: 49664 LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8320 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 @@ -6063,13 +6132,13 @@ LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 1 + LoopUnroll: 16 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -6079,15 +6148,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -6107,22 +6176,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 5 + NonTemporalA: 4 + NonTemporalB: 6 + NonTemporalC: 4 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6131,7 +6200,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6140,13 +6209,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 25 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -6154,11 +6223,11 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] @@ -6175,7 +6244,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -6190,16 +6260,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 2, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -6210,14 +6280,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6225,90 +6297,91 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3tlqjbYGbF8D3dhyvwAIN6W4godSGJACkw3uFX8nzkHg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16XOuZu64IcidHMClamzevEn0u2nxp4pZshzhk7HOLl9k= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 128 LSCB: 32 - LSPA: 4 + LSPA: 8 LSPB: 32 - LVCA: 64 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49280 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 49280 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8320 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -6316,49 +6389,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 4 + NonTemporalB: 3 + NonTemporalC: 4 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6366,8 +6439,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6376,57 +6449,58 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 26 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 @@ -6437,23 +6511,25 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6461,20 +6537,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3RKQpWWg-CKz1NtcjvnA_BMu7gJCV-SEO9UGnUQ_IcX4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT80x256x32_MI16x8aOTHAb3oJ24lDR1GY5QlgCdE9ISN9B4sNQoCcICvs4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6484,6 +6560,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 @@ -6500,101 +6577,101 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB7_NTC7_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB2_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98560 + LdsBytesNoAmax: 47616 LdsInitCVgprs: false - LdsNumBytes: 98560 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 47616 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 78336 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 47616 + LdsOffsetMetadata_Blk: 78336 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 7 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 5 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6603,7 +6680,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6612,21 +6689,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 27 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB7_NTC7_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB2_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -6634,62 +6711,65 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6697,12 +6777,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x128x32_MI32xCgBo5HHzliWlZq-opMQMpMxU6BF3PqWo19md9_NsQAs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x64x32_MI16xZMetFWyGzjhHcbZJekE_6F7yBfklY-t1A7lJBhtt1oY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -6717,9 +6797,10 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -6736,39 +6817,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 3072 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32256 + LdsBytesNoAmax: 99328 LdsInitCVgprs: false - LdsNumBytes: 32256 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 25088 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 46592 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25088 + LdsOffsetB_Blk: 90624 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32256 - LdsOffsetMetadata_Blk: 46592 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 25088 + LdsOffsetMetadata_Blk: 90624 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -6776,35 +6857,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] + MIWaveGroup: [4, 1] + MIWaveTile: [3, 4] MIWaveTileA: 3 - MIWaveTileB: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 192 + MacroTile1: 64 + MacroTileA: 192 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -6816,21 +6897,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 7 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 2 NumElementsPerThread: 48 NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 3 - NumLoadsB: 4 + NumLoadsA: 6 + NumLoadsB: 2 NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6838,7 +6919,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -6848,42 +6929,43 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 28 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 1 - ThreadTileA: 48 - ThreadTileB: 1 - TransposeLDS: 2 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 4 + ThreadTileA: 12 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -6892,15 +6974,15 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -6913,19 +6995,21 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6933,7 +7017,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xxizCevnuKDk9O05gW7EGa90Gqudceas3oEQNoKTWBLM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xSDxhcZ3u4iSYJMHirZW1CTHGblCpuWSDzLc_0f9fOAo= BufferLoad: true BufferStore: true CUCount: null @@ -6943,10 +7027,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6956,8 +7040,9 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -6965,44 +7050,44 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB2_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 8 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58368 + LdsBytesNoAmax: 114944 LdsInitCVgprs: false - LdsNumBytes: 58368 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 114944 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -7010,12 +7095,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -7023,15 +7108,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [4, 1] MIWaveTile: [1, 2] MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7045,16 +7130,16 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 0 + NonTemporalA: 2 + NonTemporalB: 0 + NonTemporalC: 1 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 @@ -7084,13 +7169,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 29 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB2_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -7099,10 +7184,10 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] @@ -7119,7 +7204,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -7134,16 +7220,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -7152,16 +7238,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7169,12 +7257,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x32_MI32xC6C0cUysf9CdYeZyDKzbyav_VZxNQyzLFs--R93kbs0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16vi3ZztTU2ZooOHejWDnWTKb_gBIj8TaDNK9DtMbhZNs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -7192,55 +7280,56 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 256 LSCB: 32 - LSPA: 8 + LSPA: 1 LSPB: 32 - LVCA: 32 + LVCA: 256 LVCB: 8 - LVPA: 2 + LVPA: 1 LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -7248,35 +7337,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -7287,22 +7376,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 2 + NonTemporalA: 1 + NonTemporalB: 7 NonTemporalC: 5 - NonTemporalD: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7311,7 +7400,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -7320,17 +7409,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 30 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -7342,37 +7431,38 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -7390,14 +7480,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7405,7 +7497,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xXKBoXMSxWcqHlgfib00ccN-Wi64fvb6VNinYeZxgXnk= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3Yo9YdRBKBj79FhGiWwDokhFsmBdvtYGmzRh04XflZAM= BufferLoad: true BufferStore: true CUCount: null @@ -7415,7 +7507,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -7428,6 +7520,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -7444,37 +7537,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 49280 LdsInitCVgprs: false - LdsNumBytes: 114944 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 49280 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 8320 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -7484,8 +7577,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] @@ -7495,14 +7588,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -7523,22 +7616,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 6 - NonTemporalC: 4 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 6 NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7556,32 +7649,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 31 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 16 - ThreadTileB: 2 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7591,33 +7684,34 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -7626,14 +7720,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7641,20 +7737,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI326WtBLaA2aXraePY7SVyMDed3tQL9zRIEHPEESGlauVA= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x160x32_MI16xPdajT8YH9ob0TRWzD1ldbfXmsBG0Yb-qwhjBqEUffYg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -7664,6 +7760,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -7680,80 +7777,80 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB3_NTC7_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 1536 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 132096 + LdsBytesNoAmax: 38400 LdsInitCVgprs: false - LdsNumBytes: 132096 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 38400 + LdsNumElementsAlignedA: 12800 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66048 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98816 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 12800 + LdsOffsetB_Blk: 78336 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98816 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 38400 + LdsOffsetMetadata_Blk: 78336 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 4] - MIWaveTileA: 1 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 96 + MacroTile1: 160 + MacroTileA: 96 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -7761,20 +7858,20 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 2 + NonTemporalC: 0 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 3 + NumLoadsB: 5 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7783,7 +7880,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -7792,21 +7889,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 32 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB3_NTC7_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -7814,10 +7911,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7827,7 +7924,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -7836,22 +7934,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -7862,14 +7960,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7877,7 +7977,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16GRSbzQx7ighSVLVcaqdzxzzKPcqoRCQBgH1GHRJJ7XM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16D6x4dco45Qa1J1WWaOfvrVhnUVbkL6MHiGgtcXWPe0g= BufferLoad: true BufferStore: true CUCount: null @@ -7897,57 +7997,58 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 256 + LSCA: 128 LSCB: 32 - LSPA: 1 + LSPA: 8 LSPB: 32 - LVCA: 256 + LVCA: 32 LVCB: 8 - LVPA: 1 + LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -7967,15 +8068,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7995,22 +8096,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalA: 1 + NonTemporalB: 2 + NonTemporalC: 7 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8018,7 +8119,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -8028,59 +8129,60 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 33 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 32 + ThreadTileA: 8 ThreadTileB: 8 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 2 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -8089,23 +8191,25 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8113,12 +8217,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI324F5sj-erxwaJrIU1muQSzHuxqiGNLO18cI0KvoEyRVM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI16g49CyeI2eVWDPXKcFHBWWzGtiVFxp9FPEDK9D-TTnuI= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -8133,9 +8237,10 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -8152,39 +8257,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS16_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadA: 2560 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33280 + LdsBytesNoAmax: 38400 LdsInitCVgprs: false - LdsNumBytes: 33280 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 38400 + LdsNumElementsAlignedA: 20992 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 20992 + LdsOffsetB_Blk: 86528 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 38400 + LdsOffsetMetadata_Blk: 86528 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -8192,35 +8297,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 4] - MIWaveTileA: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 160 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 160 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -8231,21 +8336,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 1 NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 5 + NonTemporalC: 1 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 5 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -8254,8 +8359,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8264,7 +8369,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 34 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS16_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -8273,7 +8378,7 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 4 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 @@ -8286,9 +8391,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 20 ThreadTile1: 4 - ThreadTileA: 16 + ThreadTileA: 20 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -8299,7 +8404,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -8314,9 +8420,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -8334,14 +8440,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8349,12 +8457,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI327AlSQH3iAJryoRlI_pnDbEaZckx7rs5nEeQh0g5BFdk= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1A-I78gXXrdOMKV_Cgj6SLbZMW0uS0cuPNCPj32U4D4A= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -8369,9 +8477,10 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -8379,7 +8488,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -8388,39 +8497,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC5_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA6_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 LVCB: 8 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 0 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33792 + LdsBytesNoAmax: 9216 LdsInitCVgprs: false - LdsNumBytes: 33792 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 9216 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -8428,35 +8537,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -8467,31 +8576,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 6 + NonTemporalA: 6 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8500,7 +8609,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 35 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC5_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA6_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -8509,22 +8618,22 @@ StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 4 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -8535,24 +8644,25 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -8561,7 +8671,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -8570,14 +8680,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8585,7 +8697,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI160W_L8MAdvSH6iFJ8r3bD7tGc-qFAzcgrXURTRj3-VzI= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1jVgAbVl7TAJozgubbJrw62tnMG_eow8t5K7k9PPC4WA= BufferLoad: true BufferStore: true CUCount: null @@ -8608,54 +8720,55 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 256 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 32 LSCB: 32 - LSPA: 1 - LSPB: 32 - LVCA: 256 + LSPA: 8 + LSPB: 8 + LVCA: 8 LVCB: 8 - LVPA: 1 - LVPB: 8 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 8704 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 20480 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -8667,7 +8780,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -8675,15 +8788,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8703,23 +8816,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 2 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 + NonTemporalA: 7 + NonTemporalB: 2 + NonTemporalC: 4 + NonTemporalD: 7 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 - NumLoadsB: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -8727,7 +8840,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8736,59 +8849,60 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 36 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB1_NTC6_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -8797,23 +8911,25 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8821,7 +8937,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT144x256x32_MI16zGLdUFppgmxwRtFeiiI5CTCUno1nocQnuy352ZtGY3g= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1TJ8fhKzIV0bLTU9VMa_LELZqPo5QayYI3rGq2ReOzWM= BufferLoad: true BufferStore: true CUCount: null @@ -8844,14 +8960,15 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -8860,37 +8977,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT144x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA1_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA9_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123392 + LdsBytesNoAmax: 14848 LdsInitCVgprs: false - LdsNumBytes: 123392 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 14848 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 10240 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 88576 + LdsOffsetMetadata: 14848 + LdsOffsetMetadata_Blk: 20992 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -8903,7 +9020,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -8912,14 +9029,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [9, 4] - MIWaveTileA: 9 - MIWaveTileB: 4 + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 144 - MacroTile1: 256 - MacroTileA: 144 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8939,22 +9056,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 144 - NumLoadsA: 18 - NumLoadsB: 32 - NumLoadsCoalescedA: 9 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 32 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8963,7 +9080,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8972,21 +9089,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 37 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT144x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT9_4_MO40_NTn1_NTA1_NTB1_NTC0_NTD0_NTM0_NEPBS0_NLCA9_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 SubGroup1: 64 SubGroupA: 4 @@ -8994,10 +9111,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 36 - ThreadTile1: 4 - ThreadTileA: 36 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9007,16 +9124,17 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -9024,7 +9142,7 @@ WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9033,23 +9151,25 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9057,7 +9177,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x256x32_MI32tdCNzBgk3lUDksKVviFzMyoUyHh69f9kY3FwffgwOyg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x96x32_MI16x1hlDZ2RQXNBkW0XTWcWRLJPdhG2QWccTGpKZ8d7Cm7gs= BufferLoad: true BufferStore: true CUCount: null @@ -9080,6 +9200,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 @@ -9087,7 +9208,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -9096,39 +9217,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA2_NTB2_NTC4_NTD7_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + LDSTrInst: 0 + LSCA: 32 LSCB: 32 LSPA: 4 - LSPB: 32 - LVCA: 64 + LSPB: 16 + LVCA: 32 LVCB: 8 LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 126976 + LdsBytesNoAmax: 19456 LdsInitCVgprs: false - LdsNumBytes: 126976 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 19456 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 36864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 19456 + LdsOffsetMetadata_Blk: 36864 + LdsPadA: 0 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -9136,35 +9257,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 4] - MIWaveTileA: 3 - MIWaveTileB: 4 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 256 - MacroTileA: 192 - MacroTileB: 256 + MacroTile0: 32 + MacroTile1: 96 + MacroTileA: 32 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -9175,23 +9296,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 + NonTemporalA: 3 + NonTemporalB: 6 NonTemporalC: 4 - NonTemporalD: 7 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 192 - NumGlobalWriteVectorsPerThread: 192 - NumLoadsA: 24 - NumLoadsB: 8 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularB: 6 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9199,7 +9320,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9208,59 +9329,60 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 38 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA2_NTB2_NTC4_NTD7_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 4 - ThreadTileA: 48 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9269,23 +9391,25 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9293,7 +9417,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x32x32_MI32xVzdnSYsGhQaZbelX-CANSXeu3v1q-VFJ5TDW5AP_IiM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x192x32_MI16xiJ6qFagB646z5Vtl5Hvm-aiCfDXZyxNPc89xWsDkpZY= BufferLoad: true BufferStore: true CUCount: null @@ -9305,7 +9429,7 @@ DebugStreamK: 0 DepthU: 32 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false @@ -9316,8 +9440,9 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -9325,108 +9450,108 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS10_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 - LSCA: 128 + LSCA: 16 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 16 LVPB: 8 - LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadA: 768 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 53312 + LdsBytesNoAmax: 64768 LdsInitCVgprs: false - LdsNumBytes: 53312 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 4160 + LdsNumBytes: 64768 + LdsNumElementsAlignedA: 6656 + LdsNumElementsAlignedB: 25344 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 49152 + LdsOffsetB: 6656 + LdsOffsetB_Blk: 39424 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 49152 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 6656 + LdsOffsetMetadata_Blk: 39424 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 48 + MacroTile1: 192 + MacroTileA: 48 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 + NonTemporalA: 0 NonTemporalB: 5 - NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalC: 6 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9435,7 +9560,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9444,13 +9569,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 39 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA6_NTB5_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS10_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 @@ -9459,17 +9584,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9479,12 +9604,13 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -9494,9 +9620,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9505,7 +9631,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -9514,14 +9640,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9529,29 +9657,30 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x1UDSP19TYSaBbDZT8LdxllcxelGKRB4v4VgMVknXSJ8Q= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x384x32_MI16xxRLiYtk8vs0qL-PXaqBrMaldMaCGmntAX1C6Jqy-srA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 @@ -9568,50 +9697,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x384x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_6_MO40_NTn1_NTA1_NTB7_NTC1_NTD4_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 62976 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 62976 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 73216 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetMetadata: 62976 + LdsOffsetMetadata_Blk: 73216 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -9619,15 +9748,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 6] + MIWaveTileA: 3 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 48 + MacroTile1: 384 + MacroTileA: 48 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -9641,28 +9770,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 5 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 1 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 72 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9670,8 +9799,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9680,7 +9809,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 40 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC5_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x384x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_6_MO40_NTn1_NTA1_NTB7_NTC1_NTD4_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -9689,38 +9818,39 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 4 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 12 + ThreadTile1: 6 + ThreadTileA: 12 + ThreadTileB: 6 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -9730,18 +9860,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -9750,14 +9880,16 @@ enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9765,20 +9897,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI322Jt2ItX6xZZLSMKHsF4w0CbdGttb9SNJ6Nhz0_HK7WU= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3tgew_wmZ1d78svsdgjmhnAmiMolGNWzJ-VQ5Kmkq7s0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -9788,14 +9920,15 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -9804,50 +9937,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB7_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51712 + LdsBytesNoAmax: 98560 LdsInitCVgprs: false - LdsNumBytes: 51712 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 98560 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51712 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 4 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -9855,15 +9988,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 256 - MacroTileA: 128 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -9877,29 +10010,31 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 7 - NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: 4 + NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -9916,84 +10051,90 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 41 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB7_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 4 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 - TransposeLDSMetadata: true + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10001,17 +10142,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16wN6A5rnXqv-rHoqnpsJkJScWXV4931PtcBVxS-sQP-A= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI32xRSQp-nzPx9YJcjSE6rpJqJMr777j07qBoTYEz5pwMuc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -10021,58 +10162,59 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 256 - LSCB: 32 - LSPA: 1 - LSPB: 32 - LVCA: 256 - LVCB: 8 - LVPA: 1 - LVPB: 8 - LdsBlockSizePerPadA: 512 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -10080,11 +10222,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -10092,23 +10234,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -10119,31 +10261,33 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10152,84 +10296,90 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 42 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB3_NTC3_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 16 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: 0 enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10237,7 +10387,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x128_MI16xHp8Jp1EN-BsW4NNHKXj8wdhUGyxjiO0ZBU36jrCBic4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32JxH2KKlBlMybG2dSQRlSPdHkRx_8g5fo45X9py5Jl3I= BufferLoad: true BufferStore: true CUCount: null @@ -10247,7 +10397,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -10262,15 +10412,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -10280,72 +10430,72 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 16 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60416 + LdsBytesNoAmax: 68096 LdsInitCVgprs: false - LdsNumBytes: 60416 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 68096 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 43008 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 43008 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 68096 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 4 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -10356,23 +10506,25 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -10389,8 +10541,8 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 43 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB6_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC4_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 @@ -10398,29 +10550,29 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 4 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 64 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 64 ThreadTileB: 1 - TransposeLDS: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -10428,30 +10580,33 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -10465,10 +10620,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10476,7 +10632,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x128_MI16xMybmPilfmjdxVF3hhgbtCdONGixNfiKnsBpv98ldbF8= + BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x256x64_MI16x08S02Y53B0Ne6ocNhqpSHhbrCU_jARBa0pnTDvEPOy4= BufferLoad: true BufferStore: true CUCount: null @@ -10486,10 +10642,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -10501,15 +10657,15 @@ ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] @@ -10519,47 +10675,47 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB5_NTC0_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 128 + LSCA: 32 + LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 76288 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 76288 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 67584 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 139776 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 - LdsPadA: 0 + LdsOffsetMetadata: 76288 + LdsOffsetMetadata_Blk: 139776 + LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -10567,15 +10723,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10589,29 +10745,31 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 + NonTemporalA: 4 + NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -10619,7 +10777,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10628,38 +10786,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 44 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB5_NTC0_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -10667,30 +10825,33 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 48 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: PartialsBuffer + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -10702,12 +10863,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10715,7 +10877,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xX5RiOrSD2VklAE5bBWSRJYAzYfFgytTNDEzPIilTQVc= BufferLoad: true BufferStore: true CUCount: null @@ -10725,108 +10886,108 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC4_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 16 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49408 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 49408 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -10834,22 +10995,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10858,7 +11020,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10867,32 +11029,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 45 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC4_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10908,34 +11071,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -10945,8 +11108,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10954,7 +11118,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI16x1zEU2zKTgrAqpo4O1jNbgKO5gG6NuTgAwAILYGhhS0vw= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6JGSVgpt4GrxgPWh0ngWHRKLbLQ_tqOXKZqa8Lb0Ms-k= BufferLoad: true BufferStore: true CUCount: null @@ -10964,27 +11128,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -10994,39 +11158,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -11035,37 +11199,37 @@ LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -11073,22 +11237,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11106,32 +11271,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 46 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11147,11 +11313,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -11159,22 +11325,22 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -11182,10 +11348,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11193,37 +11360,36 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x128_MI164h6utKLGsGw5LYpCWTXGz2yJ1TVk4mfoOsLcmGvytb0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -11233,101 +11399,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 1 - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 2048 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 65536 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 16384 LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 65536 - LdsOffsetB_Blk: 196608 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 98816 - LdsOffsetMetadata_Blk: 196608 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 + NumElementsPerBatchStore: 8 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 + NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11345,32 +11512,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 47 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC1_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11386,45 +11554,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11432,38 +11601,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI16pW7zOSW31Upy1wP-ePzJYYSpctNFPpPLDnJHKj_nfEw= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1tZVQklGtKOQ3p4IgWyAMn9jTY4tlNPIDxC3Y71614zM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -11472,60 +11641,60 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 LSCA: 128 LSCB: 32 - LSPA: 4 + LSPA: 8 LSPB: 32 - LVCA: 64 + LVCA: 32 LVCB: 8 LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 4] - MIWaveTileA: 8 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 128 @@ -11537,36 +11706,37 @@ MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 4 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 + NumElementsPerBatchStore: 8 NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11575,7 +11745,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11584,18 +11754,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 48 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA3_NTB4_NTC6_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -11605,17 +11775,18 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 ThreadTile1: 4 ThreadTileA: 32 ThreadTileB: 4 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -11628,42 +11799,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 - numSubTiles: 2 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11671,12 +11843,11 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI16VBp-QCoHOpL6LyENWcMSpXtCN0fwPKVG3tklVSFxaog= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -11685,24 +11856,24 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -11711,39 +11882,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 64 LSCB: 32 - LSPA: 4 + LSPA: 16 LSPB: 32 - LVCA: 64 + LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 123904 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 123904 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -11751,11 +11922,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -11763,26 +11934,26 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 192 MacroTile1: 256 - MacroTileA: 128 + MacroTileA: 192 MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -11791,21 +11962,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 2 - NonTemporalD: 3 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLoadsA: 6 NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11814,7 +11986,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11823,38 +11995,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 49 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC2_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -11864,45 +12037,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11910,7 +12084,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xAu1FYXTFu65OW_QnFxWNJ9o3fDj1dpn7VH7-NENuD30= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT19w6888cdHAWM4y5NYJiddGc0xmSYSG1iOCD6RgfOFHM= BufferLoad: true BufferStore: true CUCount: null @@ -11920,131 +12094,132 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: true - DirectToLdsA: true + DirectToLdsA: false DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 128 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49408 + LdsBytesNoAmax: 148992 LdsInitCVgprs: false - LdsNumBytes: 49408 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12053,7 +12228,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12062,32 +12237,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 50 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA7_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -12103,45 +12279,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12149,7 +12326,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32vHcR0eKpkE7e7fyejfiaedHWaJa-n0ED8ZaCTBwI9lg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2ej76CXhxc1HCjfC8xyOXAnhi0iATAwRTzc4u3zxpLfA= BufferLoad: true BufferStore: true CUCount: null @@ -12163,24 +12340,24 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -12189,11 +12366,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB7_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 LSCA: 256 LSCB: 32 LSPA: 4 @@ -12205,21 +12382,21 @@ LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 148736 LdsInitCVgprs: false - LdsNumBytes: 114944 + LdsNumBytes: 148736 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedB: 41600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 74368 LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB_Blk: 107136 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata_Blk: 107136 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -12232,7 +12409,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -12240,15 +12417,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 128 + MacroTile1: 320 MacroTileA: 256 - MacroTileB: 128 + MacroTileB: 320 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12260,7 +12437,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -12268,22 +12445,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 80 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 10 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12301,32 +12479,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 51 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB7_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 2 - StreamK: 3 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 64 + ThreadTile1: 5 + ThreadTileA: 64 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -12342,45 +12521,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12388,12 +12568,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xP3aS4yEsKaJ_WgMEwdb05dsMR2_LceuYOMf1OYI2fWk= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2l9pCGbOtMUbslloqKGQiaMAYUl03KI_tqPnmAtre-YA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -12402,24 +12582,24 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -12428,39 +12608,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x224x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 0 - LSCA: 64 + LSCA: 256 LSCB: 32 - LSPA: 16 + LSPA: 4 LSPB: 32 - LVCA: 16 + LVCA: 64 LVCB: 8 - LVPA: 4 + LVPA: 1 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 130560 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 130560 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32256 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 74240 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -12468,38 +12648,38 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -12507,22 +12687,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 8 + NumLoadsB: 7 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12531,7 +12712,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12540,38 +12721,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 52 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC7_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM6_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x224x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -12581,34 +12763,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 4 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -12618,8 +12800,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12627,7 +12810,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16wI1VAKWPNdP9RHMqPy0EQsFIoAsqd-rof14TfmS3B7w= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2bO0CyY1hfbzTfbroYwjGkYL-2OnycNC5Ws1vnO03-EM= BufferLoad: true BufferStore: true CUCount: null @@ -12637,28 +12820,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -12667,39 +12850,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_8_MO40_NTn1_NTA2_NTB4_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 2048 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 133120 + LdsBytesNoAmax: 132096 LdsInitCVgprs: false - LdsNumBytes: 133120 + LdsNumBytes: 132096 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66560 + LdsOffsetA_Blk: 66048 LdsOffsetB: 32768 - LdsOffsetB_Blk: 99328 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -12708,37 +12891,37 @@ LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 8] - MIWaveTileA: 2 - MIWaveTileB: 8 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -12746,22 +12929,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 - NonTemporalC: 7 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12779,32 +12963,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 53 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_8_MO40_NTn1_NTA2_NTB4_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -12820,10 +13005,10 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -12832,24 +13017,24 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -12857,8 +13042,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12866,37 +13052,36 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32UbnKmTyiavN2akJm6VmZtXlrWfhYQJP2E3bwh8fwpA8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -12906,37 +13091,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 133120 + LdsBytesNoAmax: 148992 LdsInitCVgprs: false - LdsNumBytes: 133120 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66560 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -12945,11 +13130,11 @@ LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -12957,15 +13142,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12977,7 +13162,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -12985,22 +13170,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13018,32 +13204,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 54 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: true StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -13059,45 +13246,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true - tailLoopOptB: true + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13105,12 +13293,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32AdmC3-P-DJd8Y5s3WQ6Z37Tt-2n6SRcuWFce7LCusPk= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2hoMDmZeEsYCyvKLfTCHSZhNF7OrTU2H4lTaznjnrQWU= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -13119,24 +13307,24 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -13145,38 +13333,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 LSCA: 256 LSCB: 32 - LSPA: 2 + LSPA: 4 LSPB: 32 - LVCA: 128 + LVCA: 64 LVCB: 8 LVPA: 1 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 124416 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 124416 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -13188,7 +13376,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13196,15 +13384,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 128 + MacroTile1: 192 MacroTileA: 256 - MacroTileB: 128 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13216,7 +13404,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -13224,22 +13412,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 16 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 8 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13257,38 +13446,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 55 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB2_NTC5_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13301,33 +13491,33 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -13337,6 +13527,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13344,78 +13535,78 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32mu0WQNBr6Bcz0RHbPND4cTi3CAGuaMu12Tv6aUvNdvM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1QHeN2A1e9y-sybm4-VD6Rl-mXAAhE2oR_OYb0CV2Kb8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 256 + LSCA: 64 LSCB: 32 - LSPA: 1 + LSPA: 16 LSPB: 32 - LVCA: 256 + LVCA: 16 LVCB: 8 - LVPA: 1 + LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 148992 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 + LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -13423,11 +13614,11 @@ LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13435,15 +13626,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13455,7 +13646,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -13463,22 +13654,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 7 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13496,38 +13688,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 56 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13537,45 +13730,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true - tailLoopOptB: true + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13583,12 +13777,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI327VropPoEGPK4p81XGOPYK7cGqJ9bMn-cBbte0oLvQa0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2SSMrbopPykWoZTFVNLV97OiwO0KcDliSRSdRIxQZbhY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -13597,64 +13791,64 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x160x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 1 LSCA: 256 LSCB: 32 - LSPA: 1 + LSPA: 4 LSPB: 32 - LVCA: 256 + LVCA: 64 LVCB: 8 LVPA: 1 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 121344 LdsInitCVgprs: false - LdsNumBytes: 116736 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 121344 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -13666,7 +13860,7 @@ LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13674,15 +13868,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 128 + MacroTile1: 160 MacroTileA: 256 - MacroTileB: 128 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13694,7 +13888,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -13702,22 +13896,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 32 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 8 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13735,38 +13930,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 57 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC4_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x160x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 2 - ThreadTileA: 64 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13779,31 +13975,31 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 16 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -13811,10 +14007,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13822,7 +14019,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x192x64_MI16xjyuL3eEnd6oM60MxKkSoj12f2_Rm9Y651748TEsy1cM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kPOJu3SjIZ5ID-BcDpMScvXSs_-HeTA7aISuMEhdHHE= BufferLoad: true BufferStore: true CUCount: null @@ -13832,28 +14029,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -13862,51 +14059,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 16 - LSCB: 64 - LSPA: 64 - LSPB: 16 - LVCA: 4 - LVCB: 16 - LVPA: 16 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 69120 + LdsBytesNoAmax: 148992 LdsInitCVgprs: false - LdsNumBytes: 69120 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 55296 + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 144896 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 69120 - LdsOffsetMetadata_Blk: 144896 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -13914,26 +14111,26 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [3, 3] - MIWaveTileA: 3 + MIWaveTile: [6, 3] + MIWaveTileA: 6 MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 192 - MacroTileA: 48 - MacroTileB: 192 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -13941,22 +14138,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 3 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 NumLoadsB: 12 NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -13964,7 +14162,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -13974,38 +14172,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 58 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_3_MO40_NTn1_NTA1_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 + ThreadTile0: 96 ThreadTile1: 3 - ThreadTileA: 12 + ThreadTileA: 96 ThreadTileB: 3 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -14015,45 +14214,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14061,12 +14261,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x384x32_MI16LKEloQ4c6Y11zX7UW67eJt-lIulINZyn1htvg5ifdns= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1CVWcLkzUFVZguyEpkQVePxOfwkSx0oBkGlgi_Q3qok0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -14075,23 +14275,23 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -14101,39 +14301,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x384x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA3_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 32 + LSCA: 128 LSCB: 32 - LSPA: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 8 + LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 156672 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 156672 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 55296 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 78336 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 101376 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -14141,38 +14341,38 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [10, 6] - MIWaveTileA: 10 - MIWaveTileB: 6 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 384 - MacroTileA: 160 - MacroTileB: 384 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -14180,22 +14380,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 120 - NumLoadsA: 10 - NumLoadsB: 12 - NumLoadsCoalescedA: 5 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14204,7 +14405,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14213,18 +14414,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 59 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x384x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA3_NTB2_NTC7_NTD5_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -14234,17 +14435,18 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 40 - ThreadTile1: 6 - ThreadTileA: 40 - ThreadTileB: 6 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -14254,7 +14456,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -14264,35 +14466,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14300,7 +14503,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x64_MI16x12XhFbXZExtNclf8zD5hKevb_buILS5setEE10T0PVPQ= BufferLoad: true BufferStore: true CUCount: null @@ -14310,27 +14512,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -14340,39 +14542,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: 0 - LSCA: 16 - LSCB: 64 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 LSPA: 16 - LSPB: 2 - LVCA: 4 - LVCB: 32 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 1 - LdsBlockSizePerPadA: 256 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 13824 + LdsBytesNoAmax: 123904 LdsInitCVgprs: false - LdsNumBytes: 13824 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 123904 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 13824 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -14381,37 +14583,37 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -14419,23 +14621,24 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14452,36 +14655,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 60 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB6_NTC1_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -14493,45 +14697,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14539,7 +14744,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x64x64_MI16x1U9LsBxJQCPuoeG_Ve_3oyvucaAXJtcLblknVGrUCZZs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6IhMdVhMfTI3xBN4uSqbJxvOYzvtnSVMCVX0u78IWd44= BufferLoad: true BufferStore: true CUCount: null @@ -14549,80 +14754,80 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA6_NTB7_NTC0_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 - LSCA: 16 - LSCB: 64 - LSPA: 8 + LSCA: 64 + LSCB: 128 + LSPA: 16 LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 8 + LVCB: 32 + LVPA: 4 LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 22528 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 22528 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 37888 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 22528 - LdsOffsetMetadata_Blk: 37888 - LdsPadA: 16 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -14630,15 +14835,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -14650,31 +14855,32 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 7 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 8 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -14691,31 +14897,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 61 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA6_NTB7_NTC0_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -14735,42 +14942,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14778,7 +14986,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xziis8ptTdenpUsOV8XyKZ82SY2P3qZnzwGDCdyAHeJ8= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6k48tNKODxoHFqHvsJzLdXfHVMveYQ1hpLblmE2a0eXQ= BufferLoad: true BufferStore: true CUCount: null @@ -14788,80 +14996,80 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC3_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 128 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49408 + LdsBytesNoAmax: 53376 LdsInitCVgprs: false - LdsNumBytes: 49408 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 53376 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -14869,14 +15077,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [4, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -14889,7 +15097,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -14897,22 +15105,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14920,8 +15129,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14930,27 +15139,28 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 62 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC3_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 ThreadTile1: 1 @@ -14971,7 +15181,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -14981,24 +15191,24 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -15008,8 +15218,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15017,7 +15228,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x32_MI16x1Zz5Ec09HIvd_O008_hCtRDLwTrwyfoXpxKA0U2dQ67Y= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3X7_irXf-O6GzVKxdiclKJFNBvIMfkuxU-ZfFdCvraBc= BufferLoad: true BufferStore: true CUCount: null @@ -15027,80 +15238,80 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC1_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 32 - LSPA: 4 + LSCA: 32 + LSCB: 64 + LSPA: 16 LSPB: 8 - LVCA: 16 - LVCB: 8 + LVCA: 8 + LVCB: 16 LVPA: 4 LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12352 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 12352 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2112 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2048 - LdsOffsetMetadata_Blk: 10240 - LdsPadA: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -15108,14 +15319,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -15128,31 +15339,32 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 + NumElementsPerBatchStore: 8 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumMbskPrefetchElements: 16 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15160,7 +15372,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -15169,27 +15381,28 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 63 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB3_NTC1_NTD0_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 ThreadTile1: 1 @@ -15198,7 +15411,7 @@ TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -15210,7 +15423,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -15220,35 +15433,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15256,90 +15470,90 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xAXUwb1N4QNEBBAkwsktnlZYTW07RiPx_dgvcHi9E9M4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3TVE79Gl-4xyNcHlbhJ0nOncgCUZpRly-oCD4jvG5RmM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 0 - LSCA: 16 - LSCB: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 64 LSPA: 16 LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 + LVCA: 8 + LVCB: 16 + LVPA: 4 LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 18944 + LdsBytesNoAmax: 28800 LdsInitCVgprs: false - LdsNumBytes: 18944 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 28800 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 4224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 43008 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18944 - LdsOffsetMetadata_Blk: 43008 - LdsPadA: 16 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -15347,14 +15561,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] + MIWaveGroup: [2, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -15367,31 +15581,32 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumMbskPrefetchElements: 16 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15399,7 +15614,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -15408,27 +15623,28 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 64 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD1_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 ThreadTile1: 1 @@ -15449,7 +15665,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -15459,35 +15675,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15495,7 +15712,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xY-N0YQfxOyAq02GUONAQj81wtsNxroPjTozQOGUD8fI= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6nBGszv-xk3760QF81fqkAcHrJvY4h-y0n1eJuTMfXK0= BufferLoad: true BufferStore: true CUCount: null @@ -15505,72 +15722,72 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB7_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: 1 - LSCA: 16 - LSCB: 256 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 4 + LSPB: 16 LVCA: 16 - LVCB: 64 - LVPA: 16 - LVPB: 1 - LdsBlockSizePerPadA: 256 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 98816 + LdsNumBytes: 57600 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 32768 LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -15578,7 +15795,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -15586,15 +15803,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -15606,7 +15823,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -15614,22 +15831,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 16 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -15647,32 +15865,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 65 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB7_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -15688,36 +15907,36 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -15725,8 +15944,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15734,7 +15954,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x32x128_MI16x1w5ApfbK1Jpefo37YDyHXcxDPX0iSzwdDGLbbUauHzQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT12-NO75n14OGnRrcE2zJwZ3cI80SyGufNoJCdW7NONHc= BufferLoad: true BufferStore: true CUCount: null @@ -15744,80 +15964,80 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB6_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 16 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 2 - LdsBlockSizePerPadA: 256 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 106752 LdsInitCVgprs: false - LdsNumBytes: 57856 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 106752 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -15825,14 +16045,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 128 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -15845,7 +16065,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -15853,22 +16073,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -15877,7 +16098,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -15886,31 +16107,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 66 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB6_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -15930,31 +16152,31 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -15962,10 +16184,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15973,7 +16196,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x32_MI32xRAh7ppQBNzWRsmJpnL4jlXiT-M-iht9O9UTSpXb7f7s= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT18r4dBftj4FCGdnY-zJAIV0H8414jv4jjNP1BBgpXMzg= BufferLoad: true BufferStore: true CUCount: null @@ -15983,110 +16206,110 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35840 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 35840 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 73728 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35840 - LdsOffsetMetadata_Blk: 73728 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 192 - MacroTileA: 64 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -16094,20 +16317,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16125,32 +16349,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 67 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16166,11 +16391,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -16178,33 +16403,34 @@ WavefrontSize: 64 WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16212,141 +16438,142 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x224x32_MI16xYiD-R4lcoMOg1xQZgGsiJ1KZ02vOjJZGyWHp18MOnjY= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6zFRfz_ClGyk9R-xSso3dzQXiskbED_8QHeLF4_pDaC4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA4_NTB7_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 0 LSCA: 64 LSCB: 32 - LSPA: 4 + LSPA: 16 LSPB: 32 - LVCA: 64 + LVCA: 16 LVCB: 8 LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 44032 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 44032 + LdsNumBytes: 57600 LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 35840 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 32768 LdsOffsetB: 8192 - LdsOffsetB_Blk: 73728 + LdsOffsetB_Blk: 40960 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 44032 - LdsOffsetMetadata_Blk: 73728 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 7] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 7 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 224 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 224 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 7 - NonTemporalC: 4 - NonTemporalD: 6 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 56 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 8 - NumLoadsB: 7 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 7 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16355,7 +16582,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16364,32 +16591,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 68 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x224x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA4_NTB7_NTC4_NTD6_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 7 - ThreadTileA: 8 - ThreadTileB: 7 + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16405,7 +16633,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -16420,30 +16648,31 @@ WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16451,38 +16680,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI16xCKUpZ9P25YvYGJ1Acu5q1cb_ng8B3vHgEyiubZYN-vs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VBy1LqzrPAwHi5usd2GrNHxCFiZX3VQhtQmF5HSKuWc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -16491,38 +16720,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 + LdsBytesNoAmax: 41472 LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 41472 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 41472 - LdsPadA: 8 + LdsOffsetMetadata: 41472 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -16531,10 +16760,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -16542,15 +16771,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16562,7 +16791,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -16570,22 +16799,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 5 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16594,7 +16824,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16603,38 +16833,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 69 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 16 + ThreadTileA: 8 ThreadTileB: 2 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -16644,34 +16875,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -16683,6 +16914,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16690,77 +16922,77 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI16xA4mLD4rkXZE5Tg5jX-PM_ibHLJxGPZ0dOqagujttYPo= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1lddss4YYeTrRrhI8j4V2ORHSjVm4oVfckN2DehcfM4Y= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -16768,12 +17000,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -16781,15 +17013,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16801,30 +17033,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 8 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16833,7 +17066,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16842,32 +17075,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 70 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16886,42 +17120,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16929,7 +17164,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x256_MI16xQ9WCQXg3kci4s1k-N1_jON3rjIc87HNykcNj6r53DuQ= BufferLoad: true BufferStore: true CUCount: null @@ -16939,28 +17173,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -16969,42 +17203,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 16 - LSCB: 256 - LSPA: 64 - LSPB: 4 - LVCA: 4 - LVCB: 64 - LVPA: 16 - LVPB: 1 - LdsBlockSizePerPadA: 1024 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 115200 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true @@ -17012,7 +17246,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17020,15 +17254,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17040,7 +17274,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -17049,21 +17283,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 3 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17081,32 +17316,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 71 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC3_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17125,31 +17361,31 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -17159,8 +17395,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17168,7 +17405,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI32x3t8AEDJh-qtEP0j1P4XwXYzISUBJp3m20CgLweU5bvQ0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1AzIzbvcAXxERNGRmUhNYSaITsRVffmpOLjYiPr-GCjU= BufferLoad: true BufferStore: true CUCount: null @@ -17182,104 +17419,104 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 32 + LSCA: 128 LSCB: 64 LSPA: 8 - LSPB: 4 + LSPB: 16 LVCA: 32 - LVCB: 64 - LVPA: 8 + LVCB: 16 + LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 16 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -17287,22 +17524,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17311,7 +17549,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -17320,32 +17558,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 72 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 32 - SubGroupA: 2 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17361,34 +17600,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 2, 4] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -17396,10 +17635,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17407,7 +17647,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16xA1XSikclny7NW89DtILrnItshh0pKUMzTEwj8VAfIF8= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1hPJ3j3hCQPLY9poEEQoBk953UZJWKGJ-JycqfOwb3oU= BufferLoad: true BufferStore: true CUCount: null @@ -17417,28 +17657,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -17447,37 +17687,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 32 - LSCB: 128 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 2 - LdsBlockSizePerPadA: 1024 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 49664 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16384 + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 32768 LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 49664 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -17485,12 +17725,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17498,15 +17738,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17518,30 +17758,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 7 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17559,32 +17800,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 73 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17603,42 +17845,43 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17646,7 +17889,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI16x1MdvvF3WuLvW5zfefjrUPKxH3mZUuZSByYW7fuq5zh-k= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wYVQmjqRgJkX091wD_hlsjrCn05Q-VGlKyxjofUMhb4= BufferLoad: true BufferStore: true CUCount: null @@ -17656,28 +17899,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -17686,39 +17929,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC7_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 61632 LdsInitCVgprs: false - LdsNumBytes: 98816 + LdsNumBytes: 61632 LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedB: 12480 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 32768 LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -17727,37 +17970,37 @@ LocalWriteUseSgprA: true LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -17765,22 +18008,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 3 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17798,18 +18042,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 74 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC7_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -17819,11 +18063,12 @@ SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17839,34 +18084,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -17874,10 +18119,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17885,7 +18131,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI16x1LoQT1m4fy0z4hWOafpHqcYDvq8L8GKbCru3r0T1zEI8= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1MdhDwvT9IqqF7BfDnreJ8GZMAtVVUbrvEVE-1LA6oQs= BufferLoad: true BufferStore: true CUCount: null @@ -17896,26 +18142,26 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false @@ -17925,37 +18171,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB6_NTC7_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 55808 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 55808 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 23040 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 55808 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -17963,12 +18209,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17976,15 +18222,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17996,30 +18242,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 7 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18037,32 +18284,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 75 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB6_NTC7_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 5 ThreadTileA: 8 - ThreadTileB: 2 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18078,45 +18326,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18124,12 +18373,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16XOuZu64IcidHMClamzevEn0u2nxp4pZshzhk7HOLl9k= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6ljv8NXufAk_cuVEtHqf5txwlBh_uLK29IZNbPQwu2vM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -18138,24 +18387,24 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -18164,39 +18413,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 73728 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 73728 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -18204,38 +18453,38 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -18243,22 +18492,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 4 - NonTemporalD: 7 - NonTemporalE: 0 - NonTemporalMetadata: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 + NumElementsPerBatchStore: 8 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18266,8 +18516,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18276,38 +18526,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 76 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA2_NTB3_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -18320,8 +18571,8 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -18329,24 +18580,24 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -18354,8 +18605,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18363,38 +18615,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x256x32_MI323gLHO3RJpkSFxOlGwg-m6naxNwlrKpvPr78QjVKcUOs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2zGGrW_n4jQYF6klG2DYYqkW_uppB7Bvv5hVyqEj8-a4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -18403,37 +18655,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 32 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 256 LSCB: 32 - LSPA: 32 + LSPA: 4 LSPB: 32 - LVCA: 8 + LVCA: 64 LVCB: 8 - LVPA: 8 + LVPA: 1 LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120832 + LdsBytesNoAmax: 140416 LdsInitCVgprs: false - LdsNumBytes: 120832 - LdsNumElementsAlignedA: 20480 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 140416 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 37440 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20480 - LdsOffsetB_Blk: 86016 + LdsOffsetA_Blk: 70208 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 102976 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 20480 - LdsOffsetMetadata_Blk: 86016 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 102976 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -18441,12 +18693,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18454,15 +18706,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 2] - MIWaveTileA: 5 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 9] + MIWaveTileA: 2 + MIWaveTileB: 9 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 256 - MacroTileA: 160 - MacroTileB: 256 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18474,9 +18726,9 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -18484,20 +18736,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 7 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 160 - NumLoadsA: 5 - NumLoadsB: 8 - NumLoadsCoalescedA: 5 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18515,32 +18768,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 77 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA0_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 2 - ThreadTileA: 80 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 9 + ThreadTileA: 32 + ThreadTileB: 9 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18559,42 +18813,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18602,12 +18857,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT80x256x32_MI16x8aOTHAb3oJ24lDR1GY5QlgCdE9ISN9B4sNQoCcICvs4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2L8kdKU9hAEZTTngWUokj3CFx3UF0ntgM4z3Px34u56w= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -18616,65 +18871,65 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB2_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 256 LSCB: 32 - LSPA: 16 + LSPA: 4 LSPB: 32 - LVCA: 16 + LVCA: 64 LVCB: 8 - LVPA: 16 + LVPA: 1 LVPB: 8 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 47616 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 47616 - LdsNumElementsAlignedA: 12800 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 12800 - LdsOffsetB_Blk: 78336 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 163840 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 47616 - LdsOffsetMetadata_Blk: 78336 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 163840 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -18682,38 +18937,38 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 4] - MIWaveTileA: 5 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 80 + MacroTile0: 256 MacroTile1: 256 - MacroTileA: 80 + MacroTileA: 256 MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -18721,22 +18976,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 10 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 8 NumLoadsB: 8 - NumLoadsCoalescedA: 5 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18745,7 +19001,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18754,38 +19010,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 78 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB2_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 4 - ThreadTileA: 20 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -18798,42 +19055,43 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18841,7 +19099,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1uqhzgX0z42t9VByvRAi39cRVaYcY67x6Oj8SvZwousc= BufferLoad: true BufferStore: true CUCount: null @@ -18852,131 +19109,132 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: 1 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 0 + LSCA: 128 LSCB: 64 LSPA: 8 - LSPB: 2 - LVCA: 16 - LVCB: 64 - LVPA: 8 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25088 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 49664 + LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 4 - NonTemporalC: 3 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -18984,7 +19242,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18993,36 +19251,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 79 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA5_NTB4_NTC3_NTD1_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -19034,45 +19293,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19080,7 +19340,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16xIqJeydFF1_Tl-klQcK_JXBaUx3vbPPgLlD9LFHD3nOM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT4pfu4mZMFXl6Px_rOlheAAHmYZJRxgXzi2LZPhoJtkgQ= BufferLoad: true BufferStore: true CUCount: null @@ -19090,27 +19350,27 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -19120,50 +19380,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 1 - LSCA: 32 - LSCB: 128 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 8 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 31744 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 31744 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 46080 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 + LdsOffsetMetadata: 31744 + LdsOffsetMetadata_Blk: 46080 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -19171,15 +19431,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19191,30 +19451,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19232,31 +19493,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 80 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 12 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 12 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -19273,7 +19535,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -19283,35 +19545,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19319,7 +19582,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI32T9xX-njlVFOTQFt-goBtdXkiPVfAmCKuzRDcK3R4fjA= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6XOuRIWbxGpO8PYAxal26328SbSHHkRpiiwDFpFzd5cE= BufferLoad: true BufferStore: true CUCount: null @@ -19329,28 +19592,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -19359,37 +19622,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB4_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: 1 - LSCA: 256 - LSCB: 32 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 - LVPA: 1 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 132096 LdsInitCVgprs: false - LdsNumBytes: 114944 + LdsNumBytes: 132096 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 66048 LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -19399,10 +19662,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -19410,15 +19673,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19430,7 +19693,7 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -19438,22 +19701,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19471,32 +19735,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 81 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB4_NTC5_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO4_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 2 - StreamK: 3 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19512,34 +19777,34 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 32 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -19549,8 +19814,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19558,37 +19824,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x64x32_MI16xZMetFWyGzjhHcbZJekE_6F7yBfklY-t1A7lJBhtt1oY= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ZiLNE3I76KjxLJJ0ZoVjNTn_UpkwS6GssxpuNB9QjBg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -19598,51 +19864,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 1 - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 3072 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 + LdsBytesNoAmax: 53312 LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 25088 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 53312 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4160 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25088 - LdsOffsetB_Blk: 90624 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25088 - LdsOffsetMetadata_Blk: 90624 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -19650,28 +19916,28 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [3, 4] - MIWaveTileA: 3 - MIWaveTileB: 4 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 64 - MacroTileA: 192 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -19679,20 +19945,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 6 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19710,32 +19977,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 82 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC1_NTD2_NTM0_NEPBS2_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 4 - ThreadTileA: 12 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19751,45 +20019,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19797,38 +20066,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x192x32_MI16idrImgWgCO7nvcJQBWTDXiMZm2WoqSF8tVNvZiYJ_9I= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1_c9FQ4VBUegtbjamFxhptq0xUR4EgDEVCIVltcTf-gY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -19837,101 +20106,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 128 LSCB: 32 - LSPA: 16 + LSPA: 8 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 8 - LdsBlockSizePerPadA: 3072 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 57472 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 24576 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 57472 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8320 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 24576 - LdsOffsetB_Blk: 90112 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 90112 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 6] - MIWaveTileA: 6 - MIWaveTileB: 6 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 192 - MacroTileA: 192 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 144 - NumGlobalWriteVectorsPerThread: 72 - NumLoadsA: 6 - NumLoadsB: 6 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -19940,7 +20210,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -19949,32 +20219,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 83 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x192x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_6_MO40_NTn1_NTA3_NTB2_NTC4_NTD3_NTM0_NEPBS4_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 6 - ThreadTileA: 24 - ThreadTileB: 6 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19990,45 +20261,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 2 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20036,7 +20308,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x64_MI16x1VZJZ9S4ao24zPbzU9SV6fXZLWTiSOVG8G43480XLTao= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6n7nm_pljv8X1U0nWk_dzuTQG8XL_nFBv6qipTuQt2AM= BufferLoad: true BufferStore: true CUCount: null @@ -20050,76 +20322,76 @@ DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB1_NTC6_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: 1 - LSCA: 16 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 2 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 64 - LVPA: 8 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25088 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4096 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -20127,15 +20399,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20147,7 +20419,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: true NoReject: false @@ -20155,23 +20427,24 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20179,7 +20452,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -20188,31 +20461,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 84 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA5_NTB1_NTC6_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -20232,42 +20506,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20275,7 +20550,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x64_MI16x1MslfBQ8HXWroQ8UfCHLcp_snVXSVDA54YqmEx6V5Oes= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6BuOegd-KKwi1l2fNCAczR3-4lBccw6pagnIcRlJE9nQ= BufferLoad: true BufferStore: true CUCount: null @@ -20285,57 +20560,57 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57856 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 57856 + LdsNumBytes: 26624 LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 @@ -20344,72 +20619,73 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 + LdsOffsetMetadata: 26624 LdsOffsetMetadata_Blk: 40960 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -20427,32 +20703,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 85 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB1_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20468,11 +20745,11 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -20480,33 +20757,34 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20514,7 +20792,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xSDxhcZ3u4iSYJMHirZW1CTHGblCpuWSDzLc_0f9fOAo= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1RK0QdtuAoNj9ZBkBIF8v6fjg0VzdTSnPVrZGC6eM97o= BufferLoad: true BufferStore: true CUCount: null @@ -20524,28 +20802,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -20554,37 +20832,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 LSCA: 128 - LSCB: 64 + LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 32 LVCA: 32 - LVCB: 16 + LVCB: 8 LVPA: 2 - LVPB: 4 + LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 83968 LdsInitCVgprs: false - LdsNumBytes: 114944 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 83968 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 67584 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 147456 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 83968 + LdsOffsetMetadata_Blk: 147456 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -20592,12 +20870,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -20605,15 +20883,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 512 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 512 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20625,30 +20903,31 @@ MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -20666,32 +20945,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 86 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA2_NTB0_NTC1_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20707,45 +20987,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20753,277 +21034,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16vi3ZztTU2ZooOHejWDnWTKb_gBIj8TaDNK9DtMbhZNs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1sFatWmJdqohNPgHurGIn0vTYTspu7sKSxkns5-iI-Zo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: 0 - LSCA: 256 - LSCB: 32 - LSPA: 1 - LSPB: 32 - LVCA: 256 - LVCB: 8 - LVPA: 1 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 - LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 - MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 5 - NonTemporalD: 3 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 87 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB7_NTC5_NTD3_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 - StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: true - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 2 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 - numSubTiles: 2 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3Yo9YdRBKBj79FhGiWwDokhFsmBdvtYGmzRh04XflZAM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: true - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -21032,11 +21074,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 64 LSCB: 32 LSPA: 16 @@ -21048,21 +21090,21 @@ LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49280 + LdsBytesNoAmax: 148992 LdsInitCVgprs: false - LdsNumBytes: 49280 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 8320 + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 40960 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8192 - LdsOffsetMetadata_Blk: 40960 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -21070,12 +21112,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -21083,266 +21125,27 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] - MaxLDS: 163840 - MaxOccupancy: 40 - MbskPrefetchMethod: 0 - MfmaInitCVgprs: false - NoLdsWriteCode: true - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 6 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: true - SFCWGM: - - [1, 1] - - [1, 1] - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 88 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC6_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 - SpaceFillingAlgo: [] - StaggerU: 0 - StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 - StreamKAtomic: 0 - StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false - UseDotInstruction: false - UseF32XEmulation: true - UseInstOffsetForGRO: 0 - UsePLRPack: false - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 - numSubTiles: 1 - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xduzYOnXWtXrhkDsB36ISXJDAIsEQJoO-tbdr76LoaHU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: 4 - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: true - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [9, 5, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 - LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 74240 - LdsPadA: 8 - LdsPadB: 8 - LdsPadMetadata: 0 - LocalReadVectorWidth: 4 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 4 - MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 256 - MacroTileA: 64 - MacroTileB: 256 + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -21352,20 +21155,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 7 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21374,7 +21178,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -21382,39 +21186,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 89 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -21427,42 +21232,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true - tailLoopOptB: true + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21470,37 +21276,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x160x32_MI16xPdajT8YH9ob0TRWzD1ldbfXmsBG0Yb-qwhjBqEUffYg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT4adP3l0wOsgdIOylpn3az32jBDr5TLNASEte2YeMUxfU= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -21510,37 +21316,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1536 - LdsBlockSizePerPadB: 128 + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 38400 + LdsBytesNoAmax: 31744 LdsInitCVgprs: false - LdsNumBytes: 38400 - LdsNumElementsAlignedA: 12800 - LdsNumElementsAlignedB: 25600 + LdsNumBytes: 31744 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 12800 - LdsOffsetB_Blk: 78336 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 46080 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 38400 - LdsOffsetMetadata_Blk: 78336 + LdsOffsetMetadata: 31744 + LdsOffsetMetadata_Blk: 46080 LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 @@ -21550,10 +21356,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -21561,15 +21367,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 5] + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] MIWaveTileA: 3 - MIWaveTileB: 5 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 160 - MacroTileA: 96 - MacroTileB: 160 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21581,7 +21387,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -21590,21 +21396,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 3 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 6 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 60 - NumGlobalWriteVectorsPerThread: 60 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 3 - NumLoadsB: 5 + NumLoadsB: 4 NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21613,7 +21420,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -21621,33 +21428,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 90 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x160x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1536_LBSPPB128_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA0_NTB3_NTC0_NTD6_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 12 - ThreadTile1: 5 + ThreadTile1: 1 ThreadTileA: 12 - ThreadTileB: 5 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -21663,7 +21471,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -21673,24 +21481,24 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 @@ -21698,10 +21506,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21709,38 +21518,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16D6x4dco45Qa1J1WWaOfvrVhnUVbkL6MHiGgtcXWPe0g= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9D_9Y2APT5jU_WNRBvEqFeqYqHamCgP7R_Hfo8HPEpQQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -21749,78 +21558,78 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33792 + LdsBytesNoAmax: 61696 LdsInitCVgprs: false - LdsNumBytes: 33792 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 61696 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 45056 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 81920 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 45056 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 8] - MIWaveTileA: 2 - MIWaveTileB: 8 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 96 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 96 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -21828,22 +21637,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 6 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21851,8 +21661,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -21860,33 +21670,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 91 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA1_NTB2_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile0: 48 + ThreadTile1: 1 + ThreadTileA: 48 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -21902,45 +21713,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21948,38 +21760,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI16g49CyeI2eVWDPXKcFHBWWzGtiVFxp9FPEDK9D-TTnuI= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6NchRY6fE5EUP0BO9TxVrOOaZY2mNScnULQhBoZehEAg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -21988,50 +21800,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS16_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 LDSTrInst: 0 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 2560 - LdsBlockSizePerPadB: 512 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 38400 + LdsBytesNoAmax: 132096 LdsInitCVgprs: false - LdsNumBytes: 38400 - LdsNumElementsAlignedA: 20992 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 20992 - LdsOffsetB_Blk: 86528 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 38400 - LdsOffsetMetadata_Blk: 86528 - LdsPadA: 16 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22039,15 +21851,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 4] - MIWaveTileA: 5 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22059,30 +21871,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 - NumLoadsB: 4 - NumLoadsCoalescedA: 5 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22091,7 +21904,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -22099,32 +21912,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 92 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2560_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA1_NTB1_NTC1_NTD7_NTM0_NEPBS16_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC2_WGMXCCGn1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 - StreamK: 3 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 20 + ThreadTileA: 8 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -22141,45 +21955,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22187,37 +22002,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1A-I78gXXrdOMKV_Cgj6SLbZMW0uS0cuPNCPj32U4D4A= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT63kDVUoqiyQ9zT3dXF-7c6slWQADFta39wl6nbJ5cQgQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -22227,50 +22042,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA6_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: 0 - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 9216 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 9216 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 16 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22278,14 +22093,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] + MIWaveGroup: [4, 1] MIWaveTile: [1, 2] MIWaveTileA: 1 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -22298,39 +22113,40 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 8 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -22338,28 +22154,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 93 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA6_NTB4_NTC1_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 4 ThreadTile1: 2 @@ -22390,35 +22207,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22426,38 +22244,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1jVgAbVl7TAJozgubbJrw62tnMG_eow8t5K7k9PPC4WA= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT33tdZ_Ane8GWNMOH9Yy9Tto67iQRR3GWjHCXudpxIhQA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -22466,37 +22284,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: 1 LSCA: 32 - LSCB: 32 - LSPA: 8 + LSCB: 64 + LSPA: 16 LSPB: 8 LVCA: 8 - LVCB: 8 - LVPA: 2 + LVCB: 16 + LVPA: 4 LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 8704 + LdsBytesNoAmax: 28800 LdsInitCVgprs: false - LdsNumBytes: 8704 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 28800 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 4224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 16384 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 20480 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 20480 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 @@ -22504,12 +22322,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22517,15 +22335,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22537,31 +22355,32 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -22577,33 +22396,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 94 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA7_NTB2_NTC4_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -22622,42 +22442,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22665,90 +22486,90 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1TJ8fhKzIV0bLTU9VMa_LELZqPo5QayYI3rGq2ReOzWM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6B6yjCgjH545xofiAKGkklYZ8gjK9Z83FXrAiImiJTZs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14848 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 14848 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 14848 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 8 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22756,15 +22577,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22776,30 +22597,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 6 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 + NumElementsPerBatchStore: 8 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22816,39 +22638,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 95 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -22861,42 +22684,43 @@ UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22904,142 +22728,143 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x96x32_MI16x1hlDZ2RQXNBkW0XTWcWRLJPdhG2QWccTGpKZ8d7Cm7gs= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kJnpDlmDlWwyslngZ-EgItmmqUmgLMBdJbRM__aOPpk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: 0 - LSCA: 32 + LSCA: 128 LSCB: 32 - LSPA: 4 - LSPB: 16 + LSPA: 8 + LSPB: 32 LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 19456 + LdsBytesNoAmax: 53312 LdsInitCVgprs: false - LdsNumBytes: 19456 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 53312 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4160 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 36864 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 19456 - LdsOffsetMetadata_Blk: 36864 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 8 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 96 - MacroTileA: 32 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -23047,7 +22872,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23055,33 +22880,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 96 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x96x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC4_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23097,45 +22923,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [128, 2, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23143,7 +22970,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x192x32_MI16xiJ6qFagB646z5Vtl5Hvm-aiCfDXZyxNPc89xWsDkpZY= BufferLoad: true BufferStore: true CUCount: null @@ -23154,80 +22980,80 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true + DirectToLds: 0 DirectToLdsA: false - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS10_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: 1 - LSCA: 16 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 16 + LVPA: 4 LVPB: 8 - LdsBlockSizePerPadA: 768 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 64768 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 64768 - LdsNumElementsAlignedA: 6656 - LdsNumElementsAlignedB: 25344 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 6656 - LdsOffsetB_Blk: 39424 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 73728 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 6656 - LdsOffsetMetadata_Blk: 39424 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 73728 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -23235,26 +23061,26 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 192 - MacroTileA: 48 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -23263,21 +23089,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 6 - NumLoadsB: 6 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23286,7 +23113,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23294,33 +23121,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 97 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB5_NTC6_NTD6_NTM0_NEPBS10_NLCA3_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23336,45 +23164,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23382,90 +23211,90 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x384x32_MI16xxRLiYtk8vs0qL-PXaqBrMaldMaCGmntAX1C6Jqy-srA= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6kJPO6d_JImjSWiXy_pK7JIg8RH_UP-eyBc2sQYIVtBk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_6_MO40_NTn1_NTA1_NTB7_NTC1_NTD4_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 - LSCA: 16 - LSCB: 32 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 16 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 62976 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 62976 - LdsNumElementsAlignedA: 7680 - LdsNumElementsAlignedB: 55296 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 7680 - LdsOffsetB_Blk: 73216 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 62976 - LdsOffsetMetadata_Blk: 73216 - LdsPadA: 8 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23473,15 +23302,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 6] - MIWaveTileA: 3 - MIWaveTileB: 6 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 384 - MacroTileA: 48 - MacroTileB: 384 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23493,30 +23322,31 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 72 - NumGlobalWriteVectorsPerThread: 72 - NumLoadsA: 6 - NumLoadsB: 12 - NumLoadsCoalescedA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23524,8 +23354,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23533,39 +23363,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 98 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x384x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_6_MO40_NTn1_NTA1_NTB7_NTC1_NTD4_NTM0_NEPBS16_NLCA3_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM16_WGMXCC8_WGMXCCGn1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 6 - ThreadTileA: 12 - ThreadTileB: 6 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -23585,35 +23416,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23621,38 +23453,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x32_MI16x1J1zmPJrEydRWke4dDOrYNixyg3y7uYK8qYrzIoZUnhg= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT34wSAq7cxuMKvQcmOPfg83xDDcTX3QGS-2yBFKS0j_is= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -23661,50 +23493,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB2_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 32 + LSCB: 128 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12800 + LdsBytesNoAmax: 57600 LdsInitCVgprs: false - LdsNumBytes: 12800 - LdsNumElementsAlignedA: 8192 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8192 - LdsOffsetB_Blk: 24576 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 12800 - LdsOffsetMetadata_Blk: 24576 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23713,14 +23545,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 1] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23734,37 +23566,37 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23772,19 +23604,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 99 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA5_NTB2_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -23794,11 +23626,12 @@ SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23814,45 +23647,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23860,38 +23694,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x64_MI16x1ykZbW8kjS_Mwo8JgpLmIp5amBOKXc9UByonlcmoBU14= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1gnSdpgupEOvJOiRh45yYCWk8ntMiLyYPoi-roHcedwc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -23900,104 +23734,103 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 LDSTrInst: 1 - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 31232 + LdsBytesNoAmax: 114944 LdsInitCVgprs: false - LdsNumBytes: 31232 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 114944 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 31232 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 48 - MacroTileA: 64 - MacroTileB: 48 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24013,33 +23846,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 100 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC8_WGMXCCGn1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -24053,50 +23887,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24104,120 +23936,120 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x128x32_MI32xcsdRVX0ybSkhmdCuVjb4BJMOuJqgW1kHyForr86LIZ4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Puz217WUimK6XrH1Pxjc7FrKKVyACIt5zgqPXoHkRJM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 1 - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LSCA: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65024 + LdsBytesNoAmax: 24704 LdsInitCVgprs: false - LdsNumBytes: 65024 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 24704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 46592 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 13824 - LdsOffsetMetadata_Blk: 46592 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -24225,23 +24057,22 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 + NumMbskPrefetchElements: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24257,39 +24088,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 101 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 48 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 48 + ThreadTileA: 4 ThreadTileB: 1 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -24297,12 +24129,9 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -24312,35 +24141,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24348,90 +24178,90 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT320x160x32_MI16gSO2xcUFkAwgSU8xxy1gJJLLVRqWr-K3o6BUJWoBh3E= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1db7_bixCXfVFtS0qxdSmo0ipeanArAwXytvCrGrHgPA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT320x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5120_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 - LSCB: 32 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 + LSCA: 16 + LSCB: 128 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 5120 - LdsBlockSizePerPadB: 256 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 64512 + LdsBytesNoAmax: 27648 LdsInitCVgprs: false - LdsNumBytes: 64512 - LdsNumElementsAlignedA: 41472 - LdsNumElementsAlignedB: 23040 + LdsNumBytes: 27648 + LdsNumElementsAlignedA: 10240 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 41472 - LdsOffsetB_Blk: 107008 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 43008 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 64512 - LdsOffsetMetadata_Blk: 107008 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 43008 LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -24439,15 +24269,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [5, 10] - MIWaveTileA: 5 - MIWaveTileB: 10 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 160 - MacroTileA: 320 - MacroTileB: 160 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24474,18 +24304,16 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 200 - NumGlobalWriteVectorsPerThread: 200 - NumLoadsA: 10 - NumLoadsB: 5 - NumLoadsCoalescedA: 5 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24493,7 +24321,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -24501,33 +24329,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 102 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT320x160x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA5120_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_10_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA5_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC2_WGMXCCGn1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 10 - ThreadTileA: 20 - ThreadTileB: 10 + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -24541,50 +24370,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24592,7 +24419,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3tgew_wmZ1d78svsdgjmhnAmiMolGNWzJ-VQ5Kmkq7s0= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WZ5hxp8HqtUBdolum7CrEAeNqQRa5V_wUt1YK_HUitM= BufferLoad: true BufferStore: true CUCount: null @@ -24602,134 +24429,132 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 1 - LSCA: 64 - LSCB: 64 + LSCA: 16 + LSCB: 256 LSPA: 16 - LSPB: 16 + LSPB: 4 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98560 + LdsBytesNoAmax: 37376 LdsInitCVgprs: false - LdsNumBytes: 98560 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16384 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 16 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 16 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24745,32 +24570,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 103 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -24785,12 +24611,9 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -24800,35 +24623,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24836,7 +24660,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xsSK0i4V19YfEXRCT7lci-LV6L-p6I3IHEgHrfhVIEvM= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Jf69jrN9Z4-iQ2t2y60MsQTFsDB9_gIxrHRzz2oaiiQ= BufferLoad: true BufferStore: true CUCount: null @@ -24847,26 +24671,26 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -24876,80 +24700,80 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 16 LSCB: 64 - LSPA: 8 + LSPA: 64 LSPB: 16 - LVCA: 32 + LVCA: 4 LVCB: 16 - LVPA: 2 + LVPA: 16 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 23552 LdsInitCVgprs: false - LdsNumBytes: 114944 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 23552 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 23552 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 16 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -24957,23 +24781,22 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24989,33 +24812,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 104 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25029,50 +24853,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25080,7 +24902,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xFcTBM0p4-B4u4RHcDFjzYdbTkiu3QTU3Jb2iD88fU9Q= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT42RnkXZHyflklHCbcqmB86XH77wK5L-QkDV3ZxLjGKKw= BufferLoad: true BufferStore: true CUCount: null @@ -25091,26 +24913,26 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false @@ -25120,80 +24942,80 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: 0 - LSCA: 128 + LSCA: 16 LSCB: 64 - LSPA: 8 + LSPA: 64 LSPB: 16 - LVCA: 32 + LVCA: 4 LVCB: 16 - LVPA: 2 + LVPA: 16 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 31744 LdsInitCVgprs: false - LdsNumBytes: 114944 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 31744 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 46080 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 31744 + LdsOffsetMetadata_Blk: 46080 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 48 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 48 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -25201,23 +25023,22 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -25233,37 +25054,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 105 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -25273,50 +25095,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: 0 enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25324,7 +25144,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xd5KB-3Wsu99IEswXfBw5F2zAdcz0rxdn-nsREODY4wE= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6o6Mqj-RUjXtkwZ1K2RrFXlC3Q-a2y2vEOhYluZ4pCok= BufferLoad: true BufferStore: true CUCount: null @@ -25335,27 +25155,27 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -25364,104 +25184,103 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 0 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 LSCB: 64 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 2 + LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 25600 LdsInitCVgprs: false - LdsNumBytes: 114944 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 49152 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 0 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -25477,37 +25296,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 106 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -25517,50 +25337,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25568,12 +25386,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16x8uILL1fCyh1qTBHYDw8Fhlq-ej8sl5xDCmV3PfmJi3g= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT20mCDNMpt7viEEIOBxhxMtxX9Mc4gjN024wHUZQhAVXc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -25582,24 +25400,24 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -25608,38 +25426,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 1 - LSCA: 128 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 LSCB: 64 - LSPA: 8 + LSPA: 32 LSPB: 16 - LVCA: 32 + LVCA: 8 LVCB: 16 - LVPA: 2 + LVPA: 8 LVPB: 4 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 3584 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 + LdsBytesNoAmax: 67584 LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 58368 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 58368 + LdsOffsetB_Blk: 189440 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 49664 - LdsOffsetMetadata_Blk: 98304 - LdsPadA: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 189440 + LdsPadA: 16 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 @@ -25651,7 +25469,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -25659,15 +25477,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 224 + MacroTile1: 32 + MacroTileA: 224 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -25679,7 +25497,7 @@ MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -25688,24 +25506,23 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalB: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 7 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -25721,37 +25538,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 107 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -25761,50 +25579,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25812,7 +25628,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI32xRSQp-nzPx9YJcjSE6rpJqJMr777j07qBoTYEz5pwMuc= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1yFcFPlsGw5B8R9atFrCRXX3pXqiO9SpMqMMeOGD1qxE= BufferLoad: true BufferStore: true CUCount: null @@ -25826,65 +25642,65 @@ DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: 0 - LSCA: 64 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 LSCB: 64 - LSPA: 16 - LSPB: 16 + LSPA: 4 + LSPB: 1 LVCA: 16 - LVCB: 16 + LVCB: 64 LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 50176 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 50176 - LdsNumElementsAlignedA: 16384 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16384 - LdsOffsetB_Blk: 81920 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 50176 - LdsOffsetMetadata_Blk: 81920 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -25892,38 +25708,38 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 - MbskPrefetchMethod: 0 + MbskPrefetchMethod: 1 MfmaInitCVgprs: false NoLdsWriteCode: false NoReject: false @@ -25933,30 +25749,29 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumMbskPrefetchElements: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -25965,33 +25780,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 108 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26005,41 +25821,38 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -26049,6 +25862,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26056,115 +25870,115 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32jCbGOzCWj1ZReUDKW2CBof4sEzcT56K5GViFAN354XE= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1KLwYNiZFx8XC0SNCEsNLwUVAoXl9bsjrmRA1WIofO6w= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 256 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [9, 5, 0] InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: 1 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + LSCA: 16 + LSCB: 256 + LSPA: 16 + LSPB: 1 + LVCA: 16 + LVCB: 256 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 66560 + LdsBytesNoAmax: 37376 LdsInitCVgprs: false - LdsNumBytes: 66560 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 163840 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 66560 - LdsOffsetMetadata_Blk: 163840 - LdsPadA: 0 - LdsPadB: 4 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -26175,25 +25989,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 4 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 16 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26209,37 +26021,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 109 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC16_WGMXCCGn1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: false UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -26249,39 +26062,36 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 @@ -26291,8 +26101,9 @@ tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26300,38 +26111,37 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x64x64_MI16x10774ejW-y_fCHxcslzP6G-lYIY7kmGqxfZavG0O9sDI= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -26340,80 +26150,80 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 LDSTrInst: 1 - LSCA: 16 + LSCA: 128 LSCB: 64 - LSPA: 64 + LSPA: 8 LSPB: 16 - LVCA: 4 + LVCA: 32 LVCB: 16 - LVPA: 16 + LVPA: 2 LVPB: 4 - LdsBlockSizePerPadA: 768 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 64512 + LdsBytesNoAmax: 114944 LdsInitCVgprs: false - LdsNumBytes: 64512 - LdsNumElementsAlignedA: 13312 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 114944 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 13312 - LdsOffsetB_Blk: 46080 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 13312 - LdsOffsetMetadata_Blk: 46080 - LdsPadA: 16 - LdsPadB: 8 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 48 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 48 + MacroTileA: 128 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -26426,18 +26236,16 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26453,33 +26261,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 110 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS12_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 1 - ThreadTileA: 12 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26493,50 +26301,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26544,38 +26350,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32JxH2KKlBlMybG2dSQRlSPdHkRx_8g5fo45X9py5Jl3I= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6x3uoXhTokfgXutkdVLR7kOydsFIqKOj0o71H_2PuOx8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -26584,50 +26390,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 LDSTrInst: 1 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 68096 + LdsBytesNoAmax: 132096 LdsInitCVgprs: false - LdsNumBytes: 68096 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 164352 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 68096 - LdsOffsetMetadata_Blk: 164352 - LdsPadA: 4 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26635,15 +26441,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26657,7 +26463,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -26665,14 +26471,14 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 @@ -26680,8 +26486,6 @@ NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26697,39 +26501,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 111 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC4_WGMXCCGn1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 2 - SubGroup1: 128 + SubGroup1: 32 SubGroupA: 2 - SubGroupB: 128 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 1 - ThreadTileA: 64 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -26737,50 +26541,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 4 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26788,7 +26590,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xi2Jigudy-18K0UMT5ct2MdW8qfs4vqYPQZeulKeB3FE= BufferLoad: true BufferStore: true CUCount: null @@ -26798,28 +26599,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true - DirectToVgprA: false - DirectToVgprB: false + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -26828,37 +26629,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 LSCA: 128 - LSCB: 64 + LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 32 LVCA: 32 - LVCB: 16 + LVCB: 8 LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 + LVPB: 8 + LdsBlockSizePerPadA: 0 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 114944 + LdsBytesNoAmax: 106880 LdsInitCVgprs: false - LdsNumBytes: 114944 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 106880 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24960 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 98304 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32768 - LdsOffsetMetadata_Blk: 98304 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 LdsPadA: 0 LdsPadB: 4 LdsPadMetadata: 0 @@ -26868,10 +26669,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26879,15 +26680,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 192 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26910,22 +26711,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26941,33 +26740,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 112 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM2_WGMXCC32_WGMXCCGn1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26981,50 +26780,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 32 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27032,7 +26829,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32x3gfKViGSlyTzbeVb2aUOzEk6CS8J7voKdng2XPN-XE4= BufferLoad: true BufferStore: true CUCount: null @@ -27042,28 +26838,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -27072,27 +26868,27 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: 0 - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 + LdsBytesNoAmax: 114944 LdsInitCVgprs: false - LdsNumBytes: 49664 + LdsNumBytes: 114944 LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedB: 16640 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -27101,7 +26897,7 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 49664 + LdsOffsetMetadata: 32768 LdsOffsetMetadata_Blk: 98304 LdsPadA: 0 LdsPadB: 4 @@ -27110,12 +26906,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27124,14 +26920,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27145,7 +26941,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -27153,14 +26949,14 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -27168,15 +26964,13 @@ NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -27185,22 +26979,22 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 113 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM4_WGMXCC2_WGMXCCGn1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -27208,10 +27002,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27225,50 +27019,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [128, 2, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 2 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27276,38 +27068,38 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI32pZ9lLvhCdmH7RpWQsDwDXdHIV2y4SsFgkBGs2DezxyQ= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zjSUpYYGuR7UmIaRfL8rbbKYy6b1BuxrRuEwINN-ueM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalReadVectorWidthB: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -27316,104 +27108,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: 1 - LSCA: 128 + LSCA: 16 LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 0 - LdsBlockSizePerPadB: 512 + LSPA: 16 + LSPB: 1 + LVCA: 4 + LVCB: 64 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 66560 + LdsBytesNoAmax: 25088 LdsInitCVgprs: false - LdsNumBytes: 66560 - LdsNumElementsAlignedA: 32768 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 32768 - LdsOffsetB_Blk: 163840 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 66560 - LdsOffsetMetadata_Blk: 163840 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 + NonTemporalA: 4 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -27429,33 +27219,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 114 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC1_WGMXCCGn1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 2 - ThreadTileA: 32 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27469,50 +27259,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false + enableGLTrA: 0 + enableGLTrB: 0 enableLDSTrA: false enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 1 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27520,7 +27308,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x256x64_MI16x08S02Y53B0Ne6ocNhqpSHhbrCU_jARBa0pnTDvEPOy4= + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3wPOxEyiXQz96xeSKClb2IOrvTQ46X8sHYWHz5Dqg9Lc= BufferLoad: true BufferStore: true CUCount: null @@ -27530,28 +27318,28 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: false - DirectToVgprB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 2 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 1 + GlobalSplitU: -1 GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: false GuaranteeNoPartialB: true @@ -27560,50 +27348,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: 0 + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 LSCA: 32 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 128 + LSPA: 32 + LSPB: 2 + LVCA: 8 + LVCB: 128 LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 76288 + LdsBytesNoAmax: 57856 LdsInitCVgprs: false - LdsNumBytes: 76288 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 67584 + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 139776 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 76288 - LdsOffsetMetadata_Blk: 139776 - LdsPadA: 8 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27611,15 +27399,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 256 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 256 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27633,7 +27421,7 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false @@ -27642,22 +27430,20 @@ NonTemporalA: 4 NonTemporalB: 4 NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 - NumLoadsB: 16 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -27673,39 +27459,39 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 115 - SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA2_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_4_MO40_NTn1_NTA4_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM48_WGMXCC8_WGMXCCGn1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -27713,279 +27499,302 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 48 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 - enableGLTrA: false - enableGLTrB: false - enableLDSTrA: 0 - enableLDSTrB: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - [2, 3, 0, 1] - - - [4, 30, 8192, 128] - - [60, 0.0] + - [21, 0.0] - - [16, 33, 8192, 128] - - [61, 0.0] + - [102, 8.99] - - [40, 61, 8192, 128] - - [0, 0.0] + - [103, 18.28] - - [128, 17711, 1, 960] - - [1, 0.0] + - [75, 58.48] - - [128, 17711, 1, 2480] - - [2, 0.0] + - [0, 0.0] - - [252, 17711, 1, 128] - - [3, 0.0] + - [74, 39.0] - - [256, 17711, 1, 128] - - [4, 0.0] + - [1, 0.0] - - [384, 246, 1, 17711] - - [82, 0.0] + - [28, 0.0] - - [384, 768, 1, 17711] - - [83, 0.0] + - [110, 73.58] - - [928, 17711, 1, 128] - - [101, 103696.0] + - [56, 59.92] - - [2732, 17711, 1, 384] - - [102, 179908.0] + - [55, 90.61] - - [6, 128, 17711, 41] - - [5, 0.0] + - [2, 0.0] - - [20, 124, 17711, 48] - - [6, 0.0] + - [3, 0.0] - - [41, 6, 17711, 128] - - [7, 0.0] + - [4, 0.0] - - [256, 256, 41, 17711] - - [8, 0.0] + - [5, 0.0] - - [1, 1, 1, 4096] - - [9, 0.0] + - [106, 0.0] - - [1, 4096, 1, 256] - - [64, 0.0] + - [101, 0.18] - - [1, 4096, 1, 512] - - [65, 0.0] + - [101, 0.3] - - [28, 4096, 1, 256] - - [66, 0.0] + - [97, 4.54] - - [28, 4096, 1, 320] - - [10, 0.0] + - [92, 5.04] - - [57, 262144, 1, 32] - - [67, 0.0] + - [22, 0.0] - - [64, 102400, 1, 64] - - [68, 0.0] + - [76, 24.71] - - [64, 131072, 1, 64] - - [69, 0.0] + - [23, 0.0] - - [64, 131072, 1, 128] - - [70, 0.0] + - [95, 36.73] - - [64, 819200, 1, 64] - - [11, 0.0] + - [76, 29.19] - - [72, 4096, 1, 256] - - [12, 0.0] + - [93, 9.21] - - [72, 4096, 1, 320] - - [13, 0.0] + - [91, 10.63] - - [82, 262144, 1, 32] - - [14, 0.0] + - [89, 16.68] - - [116, 4096, 1, 256] - - [15, 0.0] + - [96, 14.16] - - [116, 4096, 1, 320] - - [16, 0.0] + - [65, 16.26] - - [128, 4096, 1, 2048] - - [17, 0.0] + - [90, 44.23] - - [128, 131072, 1, 64] - - [76, 0.0] + - [26, 0.0] - - [160, 655360, 1, 10] - - [77, 0.0] + - [58, 7.73] - - [180, 4096, 1, 256] - - [18, 0.0] + - [88, 19.1] - - [180, 4096, 1, 320] - - [110, 47842.7] + - [80, 21.73] - - [192, 655360, 1, 48] - - [19, 0.0] + - [87, 34.07] - - [192, 655360, 1, 112] - - [20, 0.0] + - [50, 52.18] - - [224, 527553, 1, 64] - - [21, 0.0] + - [77, 39.78] - - [224, 752863, 1, 64] - - [22, 0.0] + - [78, 41.01] - - [256, 1, 1, 4096] - - [79, 0.0] + - [107, 0.1] - - [256, 4096, 1, 28] - - [23, 0.0] + - [84, 4.17] - - [256, 4096, 1, 72] - - [24, 0.0] + - [94, 9.58] - - [256, 4096, 1, 116] - - [25, 0.0] + - [82, 13.24] - - [256, 4096, 1, 180] - - [26, 0.0] + - [82, 17.83] - - [256, 4096, 1, 256] - - [27, 0.0] + - [81, 26.1] - - [256, 4096, 1, 7680] - - [81, 0.0] + - [111, 86.69] - - [288, 806154, 1, 64] - - [28, 0.0] + - [86, 40.27] - - [512, 1, 1, 4096] - - [84, 0.0] + - [107, 0.2] - - [512, 4096, 1, 1] - - [29, 0.0] + - [85, 0.27] - - [512, 4096, 1, 160] - - [30, 0.0] + - [83, 31.19] - - [512, 4096, 1, 512] - - [31, 0.0] + - [79, 54.79] - - [512, 4096, 1, 2246] - - [32, 0.0] + - [98, 78.03] - - [512, 4096, 1, 9216] - - [87, 0.0] + - [30, 0.0] - - [512, 4096, 1, 30816] - - [33, 0.0] + - [6, 0.0] - - [1600, 4096, 1, 128] - - [89, 0.0] + - [59, 52.68] - - [1824, 4096, 1, 2048] - - [90, 0.0] + - [32, 0.0] - - [2048, 4096, 1, 57] - - [34, 0.0] + - [59, 29.92] - - [2048, 4096, 1, 64] - - [111, 59310.0] + - [43, 59310.0] - - [2048, 4096, 1, 82] - - [91, 0.0] + - [33, 0.0] - - [2048, 4096, 1, 160] - - [35, 0.0] + - [59, 64.51] - - [2048, 4096, 1, 2048] - - [36, 0.0] + - [7, 0.0] - - [2246, 4096, 1, 2048] - - [37, 0.0] + - [57, 106.06] - - [2560, 4096, 1, 4096] - - [92, 0.0] + - [34, 0.0] - - [2624, 4096, 1, 2048] - - [38, 0.0] + - [60, 115.68] - - [25, 25, 8192, 32] - - [93, 0.0] + - [35, 0.0] - - [32, 25, 8192, 25] - - [94, 0.0] + - [36, 0.0] - - [32, 57, 4096, 64] - - [95, 0.0] + - [37, 0.0] - - [32, 82, 4096, 64] - - [96, 0.0] + - [38, 0.0] - - [48, 192, 4096, 160] - - [97, 0.0] + - [39, 0.0] - - [48, 642, 4096, 160] - - [98, 0.0] + - [40, 0.0] - - [64, 32, 4096, 200] - - [99, 0.0] + - [104, 22.3] - - [200, 32, 4096, 64] - - [39, 0.0] + - [105, 15.84] - - [256, 2048, 1, 128] - - [40, 0.0] + - [8, 0.0] - - [512, 2048, 1, 14336] - - [41, 0.0] + - [9, 0.0] - - [1024, 2048, 1, 128] - - [88, 0.0] + - [31, 0.0] - - [1024, 2048, 1, 14336] - - [42, 0.0] + - [10, 0.0] - - [1, 8192, 1, 128] - - [43, 0.0] + - [11, 0.0] - - [1, 8192, 1, 256] - - [44, 0.0] + - [100, 0.3] - - [120, 8192, 1, 256] - - [103, 52872.0] + - [41, 52872.0] - - [128, 1, 1, 8192] - - [45, 0.0] + - [112, 0.1] - - [128, 8192, 1, 256] - - [46, 0.0] + - [66, 25.59] - - [128, 8192, 1, 2440] - - [47, 0.0] + - [67, 53.83] - - [128, 8192, 1, 5120] - - [48, 0.0] + - [12, 0.0] - - [128, 8192, 1, 5640] - - [49, 0.0] + - [13, 0.0] - - [256, 1, 1, 8192] - - [50, 0.0] + - [113, 0.19] - - [256, 8192, 1, 512] - - [105, 124161.0] + - [70, 54.43] - - [256, 8192, 1, 528] - - [106, 117688.0] + - [71, 50.73] - - [256, 8192, 1, 2048] - - [51, 0.0] + - [72, 71.81] - - [256, 98304, 1, 128] - - [52, 0.0] + - [14, 0.0] - - [512, 8192, 1, 120] - - [107, 69570.3] + - [73, 35.22] - - [512, 8192, 1, 512] - - [53, 0.0] + - [15, 0.0] - - [512, 8192, 1, 528] - - [54, 0.0] + - [16, 0.0] - - [512, 8192, 1, 1980] - - [55, 0.0] + - [17, 0.0] - - [512, 8192, 1, 2048] - - [56, 0.0] + - [18, 0.0] - - [512, 8192, 1, 3072] - - [57, 0.0] + - [19, 0.0] - - [528, 8192, 1, 256] - - [58, 0.0] + - [20, 0.0] - - [10880, 8192, 1, 128] - - [59, 0.0] + - [54, 65.6] - - [1, 1024, 1, 128] - - [62, 0.0] + - [99, 0.02] - - [1, 4096, 1, 1] - - [63, 0.0] + - [99, 0.0] - - [128, 1, 1, 1024] - - [71, 0.0] + - [24, 0.0] - - [128, 41, 1, 17711] - - [72, 0.0] + - [25, 0.0] - - [128, 1024, 1, 128] - - [73, 0.0] + - [64, 2.92] - - [128, 1024, 1, 4096] - - [74, 0.0] + - [61, 28.91] - - [128, 1024, 1, 7456] - - [75, 0.0] + - [109, 39.35] - - [128, 17711, 1, 128] - - [100, 57292.0] + - [75, 25.45] - - [233, 131072, 1, 56] - - [78, 0.0] + - [27, 0.0] - - [256, 1024, 1, 128] - - [80, 0.0] + - [62, 5.57] - - [512, 1024, 1, 128] - - [85, 0.0] + - [65, 9.94] - - [512, 1024, 1, 2011] - - [86, 0.0] + - [29, 0.0] - - [4096, 1024, 1, 128] - - [113, 83849.7] + - [46, 44.33] - - [32, 233, 1024, 128] - - [115, 53858.6] + - [44, 53858.6] - - [256, 8192, 1, 256] - - [104, 88136.7] + - [70, 40.67] - - [512, 8192, 1, 256] - - [108, 122522.0] + - [42, 122522.0] - - [1024, 8192, 1, 512] - - [109, 187269.0] + - [48, 95.36] - - [2011, 1024, 1, 512] - - [112, 117998.0] + - [47, 55.01] - - [7968, 1024, 1, 256] - - [114, 135836.0] + - [45, 76.41] + - - [3072, 8192, 1, 512] + - [49, 103.91] + - - [4352, 8192, 1, 256] + - [50, 87.22] + - - [4608, 8192, 1, 256] + - [51, 86.41] + - - [5120, 8192, 1, 128] + - [52, 63.17] + - - [5640, 8192, 1, 128] + - [53, 58.59] + - - [7296, 8192, 1, 128] + - [51, 65.76] + - - [4132, 4096, 1, 256] + - [58, 77.55] + - - [4132, 4096, 1, 512] + - [58, 98.72] + - - [128, 1024, 1, 1] + - [63, 0.03] + - - [256, 8192, 1, 1] + - [68, 0.28] + - - [256, 8192, 1, 120] + - [69, 22.65] + - - [256, 4096, 1, 1] + - [84, 0.15] + - - [256, 1024, 1, 7968] + - [108, 59.32] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml new file mode 100644 index 00000000000..f5e8e378f9e --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Ailk_Bljk_S_MX_B_UserArgs.yaml @@ -0,0 +1,16924 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx950 +- gfx950 +- [Device 0058] +- Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DataTypeA: 0 + DataTypeAmaxD: 0 + DataTypeB: 0 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 10 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 0 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 0 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115200 + LdsInitCVgprs: false + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6JGSVgpt4GrxgPWh0ngWHRKLbLQ_tqOXKZqa8Lb0Ms-k= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115200 + LdsInitCVgprs: false + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1tZVQklGtKOQ3p4IgWyAMn9jTY4tlNPIDxC3Y71614zM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115200 + LdsInitCVgprs: false + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123904 + LdsInitCVgprs: false + LdsNumBytes: 123904 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT19w6888cdHAWM4y5NYJiddGc0xmSYSG1iOCD6RgfOFHM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 148992 + LdsInitCVgprs: false + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2ej76CXhxc1HCjfC8xyOXAnhi0iATAwRTzc4u3zxpLfA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 148736 + LdsInitCVgprs: false + LdsNumBytes: 148736 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 41600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74368 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 107136 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 107136 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 320 + MacroTileA: 256 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 320 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x320x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 5 + ThreadTileA: 64 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2l9pCGbOtMUbslloqKGQiaMAYUl03KI_tqPnmAtre-YA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x224x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 130560 + LdsInitCVgprs: false + LdsNumBytes: 130560 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 32256 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 7] + MIWaveTileA: 2 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 224 + MacroTileA: 256 + MacroTileB: 224 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 112 + NumLoadsA: 8 + NumLoadsB: 7 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 7 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x224x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2bO0CyY1hfbzTfbroYwjGkYL-2OnycNC5Ws1vnO03-EM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 132096 + LdsInitCVgprs: false + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 148992 + LdsInitCVgprs: false + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2hoMDmZeEsYCyvKLfTCHSZhNF7OrTU2H4lTaznjnrQWU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 124416 + LdsInitCVgprs: false + LdsNumBytes: 124416 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 6] + MIWaveTileA: 2 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 192 + MacroTileA: 256 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 8 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 6 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x192x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1QHeN2A1e9y-sybm4-VD6Rl-mXAAhE2oR_OYb0CV2Kb8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 148992 + LdsInitCVgprs: false + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2SSMrbopPykWoZTFVNLV97OiwO0KcDliSRSdRIxQZbhY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x160x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 121344 + LdsInitCVgprs: false + LdsNumBytes: 121344 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 160 + MacroTileA: 256 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x160x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 5 + ThreadTileA: 32 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kPOJu3SjIZ5ID-BcDpMScvXSs_-HeTA7aISuMEhdHHE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 148992 + LdsInitCVgprs: false + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1CVWcLkzUFVZguyEpkQVePxOfwkSx0oBkGlgi_Q3qok0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 123904 + LdsInitCVgprs: false + LdsNumBytes: 123904 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 90112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 90112 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 4] + MIWaveTileA: 3 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 192 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x256x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 4 + ThreadTileA: 48 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6IhMdVhMfTI3xBN4uSqbJxvOYzvtnSVMCVX0u78IWd44= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115200 + LdsInitCVgprs: false + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6k48tNKODxoHFqHvsJzLdXfHVMveYQ1hpLblmE2a0eXQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53376 + LdsInitCVgprs: false + LdsNumBytes: 53376 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3X7_irXf-O6GzVKxdiclKJFNBvIMfkuxU-ZfFdCvraBc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30208 + LdsInitCVgprs: false + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3TVE79Gl-4xyNcHlbhJ0nOncgCUZpRly-oCD4jvG5RmM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 28800 + LdsInitCVgprs: false + LdsNumBytes: 28800 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6nBGszv-xk3760QF81fqkAcHrJvY4h-y0n1eJuTMfXK0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT12-NO75n14OGnRrcE2zJwZ3cI80SyGufNoJCdW7NONHc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 106752 + LdsInitCVgprs: false + LdsNumBytes: 106752 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT18r4dBftj4FCGdnY-zJAIV0H8414jv4jjNP1BBgpXMzg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115200 + LdsInitCVgprs: false + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6zFRfz_ClGyk9R-xSso3dzQXiskbED_8QHeLF4_pDaC4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1VBy1LqzrPAwHi5usd2GrNHxCFiZX3VQhtQmF5HSKuWc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 41472 + LdsInitCVgprs: false + LdsNumBytes: 41472 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 41472 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1lddss4YYeTrRrhI8j4V2ORHSjVm4oVfckN2DehcfM4Y= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115200 + LdsInitCVgprs: false + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 115200 + LdsInitCVgprs: false + LdsNumBytes: 115200 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1AzIzbvcAXxERNGRmUhNYSaITsRVffmpOLjYiPr-GCjU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 133120 + LdsInitCVgprs: false + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1hPJ3j3hCQPLY9poEEQoBk953UZJWKGJ-JycqfOwb3oU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 49664 + LdsInitCVgprs: false + LdsNumBytes: 49664 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49664 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wYVQmjqRgJkX091wD_hlsjrCn05Q-VGlKyxjofUMhb4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61632 + LdsInitCVgprs: false + LdsNumBytes: 61632 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 12480 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 3] + MIWaveTileA: 1 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x96x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1MdhDwvT9IqqF7BfDnreJ8GZMAtVVUbrvEVE-1LA6oQs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55808 + LdsInitCVgprs: false + LdsNumBytes: 55808 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55808 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6ljv8NXufAk_cuVEtHqf5txwlBh_uLK29IZNbPQwu2vM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 73728 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 73728 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2zGGrW_n4jQYF6klG2DYYqkW_uppB7Bvv5hVyqEj8-a4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 140416 + LdsInitCVgprs: false + LdsNumBytes: 140416 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 37440 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 70208 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 102976 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 102976 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 9] + MIWaveTileA: 2 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 288 + MacroTileA: 256 + MacroTileB: 288 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 8 + NumLoadsB: 9 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 9 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x288x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 9 + ThreadTileA: 32 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT2L8kdKU9hAEZTTngWUokj3CFx3UF0ntgM4z3Px34u56w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 66560 + LdsInitCVgprs: false + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 163840 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 163840 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 8] + MIWaveTileA: 2 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 0 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 49664 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT4pfu4mZMFXl6Px_rOlheAAHmYZJRxgXzi2LZPhoJtkgQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 31744 + LdsInitCVgprs: false + LdsNumBytes: 31744 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 46080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31744 + LdsOffsetMetadata_Blk: 46080 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6XOuRIWbxGpO8PYAxal26328SbSHHkRpiiwDFpFzd5cE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 132096 + LdsInitCVgprs: false + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 8 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ZiLNE3I76KjxLJJ0ZoVjNTn_UpkwS6GssxpuNB9QjBg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53312 + LdsInitCVgprs: false + LdsNumBytes: 53312 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4160 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1_c9FQ4VBUegtbjamFxhptq0xUR4EgDEVCIVltcTf-gY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57472 + LdsInitCVgprs: false + LdsNumBytes: 57472 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8320 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6n7nm_pljv8X1U0nWk_dzuTQG8XL_nFBv6qipTuQt2AM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6BuOegd-KKwi1l2fNCAczR3-4lBccw6pagnIcRlJE9nQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 40960 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 40960 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB128_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1RK0QdtuAoNj9ZBkBIF8v6fjg0VzdTSnPVrZGC6eM97o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x512x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 83968 + LdsInitCVgprs: false + LdsNumBytes: 83968 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 147456 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 83968 + LdsOffsetMetadata_Blk: 147456 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 512 + MacroTileA: 128 + MacroTileB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x512x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB512_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1sFatWmJdqohNPgHurGIn0vTYTspu7sKSxkns5-iI-Zo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 148992 + LdsInitCVgprs: false + LdsNumBytes: 148992 + LdsNumElementsAlignedA: 24576 + LdsNumElementsAlignedB: 49920 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 74496 + LdsOffsetB: 24576 + LdsOffsetB_Blk: 99072 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 99072 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 384 + MacroTileA: 192 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 288 + NumGlobalWriteVectorsPerThread: 144 + NumLoadsA: 6 + NumLoadsB: 12 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT192x384x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 96 + ThreadTile1: 3 + ThreadTileA: 96 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT4adP3l0wOsgdIOylpn3az32jBDr5TLNASEte2YeMUxfU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 31744 + LdsInitCVgprs: false + LdsNumBytes: 31744 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 46080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31744 + LdsOffsetMetadata_Blk: 46080 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9D_9Y2APT5jU_WNRBvEqFeqYqHamCgP7R_Hfo8HPEpQQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 61696 + LdsInitCVgprs: false + LdsNumBytes: 61696 + LdsNumElementsAlignedA: 12288 + LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 12288 + LdsOffsetB_Blk: 45056 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 12288 + LdsOffsetMetadata_Blk: 45056 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT96x128x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 48 + ThreadTile1: 1 + ThreadTileA: 48 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6NchRY6fE5EUP0BO9TxVrOOaZY2mNScnULQhBoZehEAg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 132096 + LdsInitCVgprs: false + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT63kDVUoqiyQ9zT3dXF-7c6slWQADFta39wl6nbJ5cQgQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT33tdZ_Ane8GWNMOH9Yy9Tto67iQRR3GWjHCXudpxIhQA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 28800 + LdsInitCVgprs: false + LdsNumBytes: 28800 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 24576 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 8192 + LdsOffsetMetadata_Blk: 24576 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6B6yjCgjH545xofiAKGkklYZ8gjK9Z83FXrAiImiJTZs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1kJnpDlmDlWwyslngZ-EgItmmqUmgLMBdJbRM__aOPpk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 53312 + LdsInitCVgprs: false + LdsNumBytes: 53312 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 4160 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 8192 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8192 + LdsOffsetB_Blk: 73728 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 73728 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB256_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6kJPO6d_JImjSWiXy_pK7JIg8RH_UP-eyBc2sQYIVtBk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT34wSAq7cxuMKvQcmOPfg83xDDcTX3QGS-2yBFKS0j_is= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 32 + LSCB: 128 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57600 + LdsInitCVgprs: false + LdsNumBytes: 57600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8448 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1gnSdpgupEOvJOiRh45yYCWk8ntMiLyYPoi-roHcedwc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114944 + LdsInitCVgprs: false + LdsNumBytes: 114944 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Puz217WUimK6XrH1Pxjc7FrKKVyACIt5zgqPXoHkRJM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24704 + LdsInitCVgprs: false + LdsNumBytes: 24704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4224 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1db7_bixCXfVFtS0qxdSmo0ipeanArAwXytvCrGrHgPA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 27648 + LdsInitCVgprs: false + LdsNumBytes: 27648 + LdsNumElementsAlignedA: 10240 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 10240 + LdsOffsetB_Blk: 43008 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 43008 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1WZ5hxp8HqtUBdolum7CrEAeNqQRa5V_wUt1YK_HUitM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 16 + LSCB: 256 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37376 + LdsInitCVgprs: false + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Jf69jrN9Z4-iQ2t2y60MsQTFsDB9_gIxrHRzz2oaiiQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23552 + LdsInitCVgprs: false + LdsNumBytes: 23552 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23552 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT42RnkXZHyflklHCbcqmB86XH77wK5L-QkDV3ZxLjGKKw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdsBlockSizePerPadA: 768 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 31744 + LdsInitCVgprs: false + LdsNumBytes: 31744 + LdsNumElementsAlignedA: 13312 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 13312 + LdsOffsetB_Blk: 46080 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31744 + LdsOffsetMetadata_Blk: 46080 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA768_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS6_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6o6Mqj-RUjXtkwZ1K2RrFXlC3Q-a2y2vEOhYluZ4pCok= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25600 + LdsInitCVgprs: false + LdsNumBytes: 25600 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT20mCDNMpt7viEEIOBxhxMtxX9Mc4gjN024wHUZQhAVXc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdsBlockSizePerPadA: 3584 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 58368 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 58368 + LdsOffsetB_Blk: 189440 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 189440 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [7, 1] + MIWaveTileA: 7 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 224 + MacroTile1: 32 + MacroTileA: 224 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 14 + NumLoadsB: 2 + NumLoadsCoalescedA: 7 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumMbskPrefetchElements: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA3584_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS7_NLCA7_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 1 + ThreadTileA: 28 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1yFcFPlsGw5B8R9atFrCRXX3pXqiO9SpMqMMeOGD1qxE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 64 + LSPA: 4 + LSPB: 1 + LVCA: 16 + LVCB: 64 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 1 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumMbskPrefetchElements: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1KLwYNiZFx8XC0SNCEsNLwUVAoXl9bsjrmRA1WIofO6w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 16 + LSCB: 256 + LSPA: 16 + LSPB: 1 + LVCA: 16 + LVCB: 256 + LVPA: 16 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 37376 + LdsInitCVgprs: false + LdsNumBytes: 37376 + LdsNumElementsAlignedA: 20480 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 20480 + LdsOffsetB_Blk: 86016 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 37376 + LdsOffsetMetadata_Blk: 86016 + LdsPadA: 16 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA1_GRVWB1_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA16_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114944 + LdsInitCVgprs: false + LdsNumBytes: 114944 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_2_2_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 2, 2] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6x3uoXhTokfgXutkdVLR7kOydsFIqKOj0o71H_2PuOx8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4 + LDSTrInst: 1 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 132096 + LdsInitCVgprs: false + LdsNumBytes: 132096 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 66048 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT64x64x128_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_2_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 32 + SubGroupA: 2 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 2, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 106880 + LdsInitCVgprs: false + LdsNumBytes: 106880 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 24960 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 81920 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 81920 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 192 + MacroTileA: 128 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT128x192x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 + LDSTrInst: 1 + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 114944 + LdsInitCVgprs: false + LdsNumBytes: 114944 + LdsNumElementsAlignedA: 32768 + LdsNumElementsAlignedB: 16640 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 32768 + LdsOffsetB_Blk: 98304 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 98304 + LdsPadA: 0 + LdsPadB: 4 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [32, 32, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC1_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA0_LBSPPB1024_LBSPPM0_LPA0_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM6_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zjSUpYYGuR7UmIaRfL8rbbKYy6b1BuxrRuEwINN-ueM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 1 + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 1 + LVCA: 4 + LVCB: 64 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 25088 + LdsInitCVgprs: false + LdsNumBytes: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4096 + LdsOffsetMetadata_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3wPOxEyiXQz96xeSKClb2IOrvTQ46X8sHYWHz5Dqg9Lc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 32 + LSCB: 128 + LSPA: 32 + LSPB: 2 + LVCA: 8 + LVCB: 128 + LVPA: 8 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57856 + LdsInitCVgprs: false + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 4 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB1_GSUn1_GSUAMB_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA0_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB4_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: 1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: 0 + enableGLTrB: 0 + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [7968, 1024, 1, 256] + - [0, 76.41] + - - [4096, 1024, 1, 128] + - [1, 44.33] + - - [2011, 1024, 1, 512] + - [2, 55.01] + - - [1024, 8192, 1, 512] + - [3, 95.36] + - - [3072, 8192, 1, 512] + - [4, 103.91] + - - [4352, 8192, 1, 256] + - [5, 87.22] + - - [4608, 8192, 1, 256] + - [6, 86.41] + - - [5120, 8192, 1, 128] + - [7, 63.17] + - - [5640, 8192, 1, 128] + - [8, 58.59] + - - [7296, 8192, 1, 128] + - [6, 65.76] + - - [10880, 8192, 1, 128] + - [9, 65.6] + - - [2732, 17711, 1, 384] + - [10, 90.61] + - - [928, 17711, 1, 128] + - [11, 59.92] + - - [2246, 4096, 1, 2048] + - [12, 106.06] + - - [4132, 4096, 1, 256] + - [13, 77.55] + - - [2048, 4096, 1, 160] + - [14, 64.51] + - - [4132, 4096, 1, 512] + - [13, 98.72] + - - [2624, 4096, 1, 2048] + - [15, 115.68] + - - [2048, 4096, 1, 57] + - [14, 29.92] + - - [1600, 4096, 1, 128] + - [14, 52.68] + - - [128, 1024, 1, 4096] + - [16, 28.91] + - - [256, 1024, 1, 128] + - [17, 5.57] + - - [128, 1024, 1, 1] + - [18, 0.03] + - - [128, 1024, 1, 128] + - [19, 2.92] + - - [512, 1024, 1, 128] + - [20, 9.94] + - - [128, 8192, 1, 256] + - [21, 25.59] + - - [128, 8192, 1, 2440] + - [22, 53.83] + - - [256, 8192, 1, 1] + - [23, 0.28] + - - [256, 8192, 1, 120] + - [24, 22.65] + - - [256, 8192, 1, 256] + - [25, 40.67] + - - [256, 8192, 1, 512] + - [25, 54.43] + - - [256, 8192, 1, 528] + - [26, 50.73] + - - [256, 8192, 1, 2048] + - [27, 71.81] + - - [512, 8192, 1, 120] + - [28, 35.22] + - - [252, 17711, 1, 128] + - [29, 39.0] + - - [128, 17711, 1, 128] + - [30, 25.45] + - - [128, 17711, 1, 960] + - [30, 58.48] + - - [64, 819200, 1, 64] + - [31, 29.19] + - - [224, 527553, 1, 64] + - [32, 39.78] + - - [224, 752863, 1, 64] + - [33, 41.01] + - - [512, 4096, 1, 512] + - [34, 54.79] + - - [180, 4096, 1, 320] + - [35, 21.73] + - - [256, 4096, 1, 256] + - [36, 26.1] + - - [256, 4096, 1, 180] + - [37, 17.83] + - - [512, 4096, 1, 160] + - [38, 31.19] + - - [256, 4096, 1, 116] + - [37, 13.24] + - - [256, 4096, 1, 28] + - [39, 4.17] + - - [512, 4096, 1, 1] + - [40, 0.27] + - - [256, 4096, 1, 1] + - [39, 0.15] + - - [192, 655360, 1, 112] + - [5, 52.18] + - - [288, 806154, 1, 64] + - [41, 40.27] + - - [192, 655360, 1, 48] + - [42, 34.07] + - - [116, 4096, 1, 320] + - [20, 16.26] + - - [180, 4096, 1, 256] + - [43, 19.1] + - - [82, 262144, 1, 32] + - [44, 16.68] + - - [128, 4096, 1, 2048] + - [45, 44.23] + - - [72, 4096, 1, 320] + - [46, 10.63] + - - [28, 4096, 1, 320] + - [47, 5.04] + - - [64, 102400, 1, 64] + - [31, 24.71] + - - [72, 4096, 1, 256] + - [48, 9.21] + - - [256, 4096, 1, 72] + - [49, 9.58] + - - [160, 655360, 1, 10] + - [13, 7.73] + - - [64, 131072, 1, 128] + - [50, 36.73] + - - [116, 4096, 1, 256] + - [51, 14.16] + - - [28, 4096, 1, 256] + - [52, 4.54] + - - [512, 4096, 1, 2246] + - [53, 78.03] + - - [1, 1024, 1, 128] + - [54, 0.02] + - - [1, 8192, 1, 256] + - [55, 0.3] + - - [1, 4096, 1, 512] + - [56, 0.3] + - - [1, 4096, 1, 256] + - [56, 0.18] + - - [1, 4096, 1, 1] + - [54, 0.0] + - - [16, 33, 8192, 128] + - [57, 8.99] + - - [40, 61, 8192, 128] + - [58, 18.28] + - - [64, 32, 4096, 200] + - [59, 22.3] + - - [200, 32, 4096, 64] + - [60, 15.84] + - - [1, 1, 1, 4096] + - [61, 0.0] + - - [512, 1, 1, 4096] + - [62, 0.2] + - - [256, 1, 1, 4096] + - [62, 0.1] + - - [256, 1024, 1, 7968] + - [63, 59.32] + - - [128, 1024, 1, 7456] + - [64, 39.35] + - - [384, 768, 1, 17711] + - [65, 73.58] + - - [256, 4096, 1, 7680] + - [66, 86.69] + - - [128, 1, 1, 8192] + - [67, 0.1] + - - [256, 1, 1, 8192] + - [68, 0.19] +- null +- null +- DeviceEfficiency +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml index 06582460569..47eccaf2f45 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs.yaml @@ -82,6 +82,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -131,7 +132,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB5_NTC4_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB5_NTC4_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -240,7 +241,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB5_NTC4_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB5_NTC4_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -318,6 +319,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -325,12 +327,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x128x32_MI32vSYX90ppDprEavxj8aYo4_RVHyec3tByzM_fy0-Q5p0= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16ng6z_ayzORq6mLQC9NmCp022CyXmT20mCLXbBbe-m5o= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -355,7 +357,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -367,7 +369,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB7_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -377,26 +379,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 110592 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 110592 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -404,11 +406,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -416,23 +418,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 128 - MacroTileA: 192 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -444,21 +446,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 7 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 96 - NumLoadsA: 6 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -467,7 +469,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -476,7 +478,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB7_NTC1_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -484,24 +486,24 @@ StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 48 - ThreadTile1: 2 - ThreadTileA: 48 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -519,14 +521,14 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 @@ -554,6 +556,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -561,20 +564,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16ng6z_ayzORq6mLQC9NmCp022CyXmT20mCLXbBbe-m5o= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16sZHCGNr8NOjsJZjJoKLx2D9w2iDDWktSsPwK2KagM0I= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -585,8 +588,8 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -603,14 +606,14 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB7_NTC5_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 512 @@ -638,8 +641,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -673,28 +676,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 3 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 5 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 NumElementsPerThread: 256 NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsA: 32 + NumLoadsB: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 32 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -712,13 +715,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB2_NTC6_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB7_NTC5_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: true StoreSyncOpt: 1 @@ -726,7 +729,7 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -763,7 +766,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 32 WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -786,10 +789,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -797,20 +801,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16sZHCGNr8NOjsJZjJoKLx2D9w2iDDWktSsPwK2KagM0I= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32HDLrFKOalrtw2KSVTZ7XOtxf5e65xHrmK1BWhomWgmY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -821,8 +825,8 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -839,98 +843,98 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB7_NTC5_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 35328 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 6 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 - NumLoadsB: 32 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 32 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -939,7 +943,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -948,38 +952,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB7_NTC5_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 1 + ThreadTile0: 64 + ThreadTile1: 1 + ThreadTileA: 64 + ThreadTileB: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -992,15 +996,15 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 16 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -1013,7 +1017,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -1022,10 +1026,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1033,7 +1038,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32HDLrFKOalrtw2KSVTZ7XOtxf5e65xHrmK1BWhomWgmY= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x288x32_MI16jlcgKB9N4SyOB1ZF1cNtZuU9XEPy1lSQN0hbLApWEVk= BufferLoad: true BufferStore: true CUCount: null @@ -1063,7 +1068,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1075,7 +1080,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x288x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -1085,26 +1090,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35328 + LdsBytesNoAmax: 147456 LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 147456 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 46080 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 73728 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -1112,35 +1117,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 9] + MIWaveTileA: 6 + MIWaveTileB: 9 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 192 + MacroTile1: 288 + MacroTileA: 192 + MacroTileB: 288 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -1152,21 +1157,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 4 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 216 + NumGlobalWriteVectorsPerThread: 108 + NumLoadsA: 6 + NumLoadsB: 9 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 9 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -1175,7 +1180,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -1184,32 +1189,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB2_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x288x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 1 - ThreadTileA: 64 - ThreadTileB: 1 + ThreadTile0: 24 + ThreadTile1: 9 + ThreadTileA: 24 + ThreadTileB: 9 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -1227,7 +1232,7 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -1249,7 +1254,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -1262,6 +1267,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1269,12 +1275,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x32x32_MI32xpBVJCKYVdddJ9ZWJn_y6OBqYt3-BIpDKWLuXbExc_BA= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x224x32_MI16xExhRy6GXUDd_JG6KWPknC-ImNIhRsFD21cB5RtDRgf4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -1293,13 +1299,13 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1311,36 +1317,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x224x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB5_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 + LSPA: 4 LSPB: 16 - LVCA: 8 + LVCA: 32 LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 22016 + LdsBytesNoAmax: 38400 LdsInitCVgprs: false - LdsNumBytes: 22016 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 38400 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 35840 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 68096 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 22016 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 38400 + LdsOffsetMetadata_Blk: 68096 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -1348,35 +1354,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 7] + MIWaveTileA: 1 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 224 + MacroTileA: 16 + MacroTileB: 224 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -1388,21 +1394,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 6 + NonTemporalB: 5 + NonTemporalC: 0 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 28 + NumGlobalWriteVectorsPerThread: 28 + NumLoadsA: 4 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -1420,17 +1426,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_2_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x224x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB5_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM6_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -1442,10 +1448,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 7 + ThreadTileA: 4 + ThreadTileB: 7 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -1463,16 +1469,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 6 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -1494,10 +1500,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1505,17 +1512,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x96x64_MI16x1b_n0pL9wvqKZmPxvI_atrAx5msSEDab_k9FB6A5LHIE= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x128x32_MI16xFTZn1oT96nznuMUL1F717od6NlDNu3Zl5rceKzvu6pw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -1525,7 +1532,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -1535,7 +1542,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1547,34 +1554,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 120832 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 120832 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -1584,8 +1591,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -1595,15 +1602,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 96 - MacroTile1: 96 - MacroTileA: 96 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -1623,22 +1630,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 4 + NonTemporalA: 1 + NonTemporalB: 3 + NonTemporalC: 6 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 6 - NumLoadsB: 6 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -1646,7 +1653,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -1656,38 +1663,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 - TransposeLDS: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -1699,23 +1706,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -1734,6 +1741,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1741,20 +1749,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x288x32_MI16jlcgKB9N4SyOB1ZF1cNtZuU9XEPy1lSQN0hbLApWEVk= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x160x32_MI16xMGDb4WqTAUNdXONPi4f_hVkBQx-lOyX3fNSJSClAZJM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -1771,7 +1779,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -1783,34 +1791,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x288x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA3_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 147456 + LdsBytesNoAmax: 60224 LdsInitCVgprs: false - LdsNumBytes: 147456 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 46080 + LdsNumBytes: 60224 + LdsNumElementsAlignedA: 6336 + LdsNumElementsAlignedB: 21120 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 73728 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 6336 + LdsOffsetB_Blk: 39104 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27648 - LdsOffsetMetadata_Blk: 101376 + LdsOffsetMetadata: 6336 + LdsOffsetMetadata_Blk: 39104 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -1818,8 +1826,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -1831,15 +1839,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 9] - MIWaveTileA: 6 - MIWaveTileB: 9 + MIWaveGroup: [1, 2] + MIWaveTile: [3, 5] + MIWaveTileA: 3 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 288 - MacroTileA: 192 - MacroTileB: 288 + MacroTile0: 48 + MacroTile1: 160 + MacroTileA: 48 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -1853,29 +1861,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalA: 3 + NonTemporalB: 5 + NonTemporalC: 1 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 216 - NumGlobalWriteVectorsPerThread: 108 - NumLoadsA: 6 - NumLoadsB: 9 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 3 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 9 - NumThreads: 256 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 10 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -1892,32 +1900,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x288x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_9_MO40_NTn1_NTA0_NTB3_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA3_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM4_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 + StreamKXCCMapping: 0 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 9 - ThreadTileA: 24 - ThreadTileB: 9 + ThreadTile0: 12 + ThreadTile1: 5 + ThreadTileA: 12 + ThreadTileB: 5 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -1935,16 +1943,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -1966,10 +1974,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -1977,20 +1986,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x128x32_MI16xNV2XQhEHlt21rI1bJFo8vAnWSfz8zM76EEZNUDVuySI= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xceJrtbRzoZ8U_X3c8TU-g5N3iVDPEh4CXnMQuJUum-0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -2019,34 +2028,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC2_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 25088 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 7680 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 7680 - LdsOffsetB_Blk: 40448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25088 - LdsOffsetMetadata_Blk: 40448 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -2054,12 +2063,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -2067,15 +2076,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [3, 4] - MIWaveTileA: 3 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 128 - MacroTileA: 48 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2089,29 +2098,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 6 + NonTemporalA: 3 + NonTemporalB: 6 + NonTemporalC: 2 NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 3 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -2128,13 +2137,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 8 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_4_MO40_NTn1_NTA2_NTB0_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC2_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -2143,17 +2152,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 4 - ThreadTileA: 12 - ThreadTileB: 4 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -2172,22 +2181,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -2202,10 +2211,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2213,31 +2223,31 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x224x32_MI16xExhRy6GXUDd_JG6KWPknC-ImNIhRsFD21cB5RtDRgf4= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xKd3xOQJnZvH7NER92h_FWeDLMN6llBgbuSKPfS-dh_8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -2255,34 +2265,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB5_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB6_NTC6_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 16 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 38400 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 38400 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 35840 + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 68096 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 38400 - LdsOffsetMetadata_Blk: 68096 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -2290,10 +2300,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -2303,15 +2313,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 7] + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 7 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 224 - MacroTileA: 16 - MacroTileB: 224 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2325,36 +2335,36 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalB: 6 + NonTemporalC: 6 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 28 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 4 - NumLoadsB: 14 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 14 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -2364,12 +2374,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x224x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_7_MO40_NTn1_NTA0_NTB5_NTC0_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM6_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB6_NTC6_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -2379,17 +2389,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 7 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 7 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -2408,22 +2418,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -2442,6 +2452,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2449,7 +2460,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x128x32_MI16xFTZn1oT96nznuMUL1F717od6NlDNu3Zl5rceKzvu6pw= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16_yRrZ54pxRgRukp1Qzk0QYQwx3MBBzFzyA0wCMrV_ho= BufferLoad: true BufferStore: true CUCount: null @@ -2479,7 +2490,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -2491,7 +2502,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -2501,24 +2512,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23040 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 23040 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 37376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 37376 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -2539,14 +2550,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -2570,18 +2581,18 @@ NonTemporalA: 1 NonTemporalB: 3 NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -2591,7 +2602,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -2600,32 +2611,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -2643,16 +2654,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -2678,6 +2689,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2685,17 +2697,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x128x32_MI16xc_P3dMzRTIuQ28njVtEwaNCN95ZzWdJfgzk_qtoBxJM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x64x64_MI16x1tj_kzs63yw3mFQwAEJiL4iTevb3pL7nbZor8PNO8HKQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -2705,7 +2717,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -2715,7 +2727,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -2727,34 +2739,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB0_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23040 + LdsBytesNoAmax: 32256 LdsInitCVgprs: false - LdsNumBytes: 23040 - LdsNumElementsAlignedA: 4608 + LdsNumBytes: 32256 + LdsNumElementsAlignedA: 13824 LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 37376 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 46592 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 37376 + LdsOffsetMetadata: 32256 + LdsOffsetMetadata_Blk: 46592 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -2764,10 +2776,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -2776,14 +2788,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [3, 1] + MIWaveTileA: 3 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 48 + MacroTile1: 64 + MacroTileA: 48 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -2805,19 +2817,19 @@ NonTemporal: -1 NonTemporalA: 1 NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 7 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 3 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -2826,7 +2838,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -2836,17 +2848,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB0_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 + StoreSyncOpt: 0 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -2858,16 +2870,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 12 + ThreadTile1: 1 + ThreadTileA: 12 + ThreadTileB: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -2879,8 +2891,8 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -2888,14 +2900,14 @@ WavefrontSize: 64 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -2914,6 +2926,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -2921,7 +2934,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x160x32_MI16xMGDb4WqTAUNdXONPi4f_hVkBQx-lOyX3fNSJSClAZJM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3okxSER2kK2UA9ov1OBTPhQlusC6KlbBX40fJhWBRxnQ= BufferLoad: true BufferStore: true CUCount: null @@ -2932,9 +2945,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -2963,99 +2976,99 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA3_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 60224 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 60224 - LdsNumElementsAlignedA: 6336 - LdsNumElementsAlignedB: 21120 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 6336 - LdsOffsetB_Blk: 39104 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 6336 - LdsOffsetMetadata_Blk: 39104 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [3, 5] - MIWaveTileA: 3 - MIWaveTileB: 5 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 160 - MacroTileA: 48 - MacroTileB: 160 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 - NonTemporalC: 1 + NonTemporalA: 1 + NonTemporalB: 0 + NonTemporalC: 6 NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 60 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 3 - NumLoadsB: 10 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 10 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -3063,7 +3076,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3072,7 +3085,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_5_MO40_NTn1_NTA3_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM4_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -3088,22 +3101,22 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 32 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 32 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 5 - ThreadTileA: 12 - ThreadTileB: 5 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -3122,9 +3135,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 16 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3150,6 +3163,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3157,20 +3171,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xe1BNDmI7JV_4uGeYH8WBcxkTo3OOZcdUmGTChlcXZl4= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1vHhtns6gPgv2zdnViNHWKe69UlMziiY8ogzUIM-cXLA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -3199,47 +3213,47 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB5_NTC2_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -3247,15 +3261,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] + MIWaveGroup: [2, 2] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3269,28 +3283,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3308,7 +3322,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB5_NTC2_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -3324,9 +3338,9 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] @@ -3358,16 +3372,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 2] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -3386,6 +3400,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3393,7 +3408,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI16x8OQVJ6szc4qGEeyzu1Mvik5oji8_0NULO2KTqu03Wgo= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16axuOnARtwf3Zw_BH37e0vrXw3uX3YI2sxTIy41cp-BQ= BufferLoad: true BufferStore: true CUCount: null @@ -3403,7 +3418,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -3435,34 +3450,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB5_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -3472,8 +3487,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -3483,14 +3498,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] + MIWaveGroup: [2, 2] MIWaveTile: [4, 4] MIWaveTileA: 4 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -3512,22 +3527,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 5 + NonTemporalB: 4 NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 + NumElementsPerBatchStore: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -3535,7 +3550,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -3544,24 +3559,24 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB5_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 + StreamKXCCMapping: 8 + SubGroup0: 8 SubGroup1: 32 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false @@ -3570,12 +3585,12 @@ ThreadTile1: 4 ThreadTileA: 16 ThreadTileB: 4 - TransposeLDS: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -3594,16 +3609,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -3618,10 +3633,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3629,7 +3645,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x32_MI16xQeJZY1hhTV13Iu-kbxWY2AO4qcIeNqeUeKM_ciFj3tg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x32_MI16x1j604NGXToiBb5XPUVM0hkt4WXLh6NV34LIceG_D6gTg= BufferLoad: true BufferStore: true CUCount: null @@ -3671,7 +3687,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA2_NTB4_NTC2_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -3682,23 +3698,23 @@ LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 36864 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 36864 + LdsNumBytes: 57344 LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 32768 LdsOffsetB: 9216 - LdsOffsetB_Blk: 74752 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 36864 - LdsOffsetMetadata_Blk: 74752 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -3720,14 +3736,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 6] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 6 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 192 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 192 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3747,22 +3763,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 - NonTemporalC: 2 + NonTemporalA: 3 + NonTemporalB: 1 + NonTemporalC: 6 NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 2 - NumLoadsB: 6 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -3780,7 +3796,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_6_MO40_NTn1_NTA2_NTB4_NTC2_NTD7_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -3794,7 +3810,7 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -3803,15 +3819,15 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 6 + ThreadTile1: 3 ThreadTileA: 8 - ThreadTileB: 6 - TransposeLDS: 1 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -3824,15 +3840,15 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -3858,6 +3874,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -3865,20 +3882,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xceJrtbRzoZ8U_X3c8TU-g5N3iVDPEh4CXnMQuJUum-0= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x32_MI16x1leQ8mjkIFyIZuDPwBWpvxiJZrseNgqKruLAXi8hCJNQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -3895,7 +3912,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -3907,34 +3924,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC2_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 57344 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 57344 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -3942,10 +3959,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -3955,15 +3972,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -3977,28 +3994,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 2 + NonTemporalA: 2 + NonTemporalB: 1 + NonTemporalC: 4 NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4016,7 +4033,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB6_NTC2_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -4026,22 +4043,22 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -4059,23 +4076,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -4090,10 +4107,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4101,17 +4119,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT96x64x64_MI16x1ZYzOMFyvQ_WZv4g-rr80VpnruR-8YsAaclS0uKKTkXQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x32_MI32xt2IWy6oc3iaHRd0qwAxR-vlfdl6TmzLLHgr_zeUr1t0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -4131,7 +4149,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -4143,36 +4161,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB4_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 45056 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 45056 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 45056 - LdsOffsetMetadata_Blk: 93184 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -4181,10 +4199,10 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -4192,23 +4210,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [3, 2] - MIWaveTileA: 3 - MIWaveTileB: 2 + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -4219,22 +4237,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 4 - NonTemporalC: 2 - NonTemporalD: 1 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 6 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4252,32 +4270,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA1_NTB4_NTC2_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 2 - ThreadTileA: 12 - ThreadTileB: 2 + ThreadTile0: 32 + ThreadTile1: 1 + ThreadTileA: 32 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -4295,23 +4313,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -4326,10 +4344,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4337,20 +4356,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT112x320x32_MI16G4GEHbAdPUcJY9T4f4Vw8xA9HnOy9eBCKZR-rA6UYjQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI162pz3v-E27-meies96c7sXE26vltpCAQ0pdpY2inCyGg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -4361,13 +4380,13 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -4379,34 +4398,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT112x320x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB6_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 123904 + LdsBytesNoAmax: 69632 LdsInitCVgprs: false - LdsNumBytes: 123904 - LdsNumElementsAlignedA: 16128 - LdsNumElementsAlignedB: 42240 + LdsNumBytes: 69632 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16128 - LdsOffsetB_Blk: 81664 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 165888 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16128 - LdsOffsetMetadata_Blk: 81664 + LdsOffsetMetadata: 69632 + LdsOffsetMetadata_Blk: 165888 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -4414,8 +4433,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -4427,15 +4446,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [7, 5] - MIWaveTileA: 7 - MIWaveTileB: 5 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 112 - MacroTile1: 320 - MacroTileA: 112 - MacroTileB: 320 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4449,28 +4468,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 140 - NumGlobalWriteVectorsPerThread: 140 - NumLoadsA: 14 - NumLoadsB: 10 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4488,38 +4507,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT112x320x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA0_NTB6_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 5 - ThreadTileA: 28 - ThreadTileB: 5 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -4531,16 +4550,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -4566,6 +4585,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4573,20 +4593,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x128_MI16xKd3xOQJnZvH7NER92h_FWeDLMN6llBgbuSKPfS-dh_8= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16Rai2BMU5qUNE8ec14Xy8sjzYfYE4wTy8B1l9bpYhNTM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -4603,7 +4623,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -4615,34 +4635,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB6_NTC6_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -4650,10 +4670,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -4663,15 +4683,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4685,28 +4705,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 6 + NonTemporalA: 1 + NonTemporalB: 0 NonTemporalC: 6 - NonTemporalD: 0 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4715,7 +4735,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4724,38 +4744,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB6_NTC6_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + StreamKXCCMapping: 8 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -4767,29 +4787,29 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -4802,6 +4822,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -4809,20 +4830,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1yIHfX59pen49io9SVtqOmH7NfRyptehvc2aTbGBwn6Y= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x256x32_MI16_hmfRslJ7_UM9E5_Eh9cjKTPKXxOaGMYtgTpwcJhKMM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -4851,34 +4872,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB5_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 58112 + LdsBytesNoAmax: 125952 LdsInitCVgprs: false - LdsNumBytes: 58112 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 125952 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 49664 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 49664 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 91136 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -4886,10 +4907,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -4899,15 +4920,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 8] + MIWaveTileA: 5 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 160 + MacroTile1: 256 + MacroTileA: 160 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -4921,28 +4942,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 + NonTemporalA: 3 + NonTemporalB: 0 NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 160 + NumGlobalWriteVectorsPerThread: 160 + NumLoadsA: 5 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -4951,7 +4972,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -4960,32 +4981,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 20 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB5_NTC1_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 20 + ThreadTile1: 8 + ThreadTileA: 20 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5004,22 +5025,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -5034,10 +5055,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5045,27 +5067,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI16_yRrZ54pxRgRukp1Qzk0QYQwx3MBBzFzyA0wCMrV_ho= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT224x160x32_MI16seqDTzLmKwfO-jGw_7q0e0kRLQ9kjh4yCHvuyvaH2cs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -5075,7 +5097,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -5087,7 +5109,7 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -5097,24 +5119,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 29568 + LdsNumElementsAlignedB: 21120 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 29568 + LdsOffsetB_Blk: 95104 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 29568 + LdsOffsetMetadata_Blk: 95104 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -5122,8 +5144,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -5136,14 +5158,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [7, 5] + MIWaveTileA: 7 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 224 + MacroTile1: 160 + MacroTileA: 224 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5157,28 +5179,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 7 + NonTemporalB: 1 + NonTemporalC: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 140 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 7 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 5 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5186,7 +5208,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -5196,7 +5218,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB3_NTC6_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -5206,11 +5228,11 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -5218,10 +5240,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 28 + ThreadTile1: 5 + ThreadTileA: 28 + ThreadTileB: 5 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5239,8 +5261,8 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -5248,7 +5270,7 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -5270,10 +5292,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5281,7 +5304,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x64x64_MI16x1c8C3ENIpF0Lj2Vn5y8emDWGgLXKNNOk6OuhBiu-fS00= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16xN2l8jPNpvUrR9ihG-g4bbj4txPWtM9VLB2cEdM2wQDo= BufferLoad: true BufferStore: true CUCount: null @@ -5291,10 +5314,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -5323,34 +5346,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB5_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32256 + LdsBytesNoAmax: 99328 LdsInitCVgprs: false - LdsNumBytes: 32256 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 46592 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32256 - LdsOffsetMetadata_Blk: 46592 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -5358,12 +5381,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -5371,15 +5394,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 64 - MacroTileA: 48 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -5393,27 +5416,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 5 - NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalA: 2 + NonTemporalB: 6 + NonTemporalC: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -5432,13 +5455,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_1_MO40_NTn1_NTA7_NTB5_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -5446,24 +5469,24 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 12 + ThreadTileA: 4 ThreadTileB: 1 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -5482,16 +5505,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -5506,10 +5529,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5517,17 +5541,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x64x64_MI16x1tj_kzs63yw3mFQwAEJiL4iTevb3pL7nbZor8PNO8HKQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1KDrJ4Ic4uP1T91fz80L-1p9uxPjaRg8FouAoxpGOl90= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -5537,7 +5561,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -5559,34 +5583,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32256 + LdsBytesNoAmax: 30720 LdsInitCVgprs: false - LdsNumBytes: 32256 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 46592 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32256 - LdsOffsetMetadata_Blk: 46592 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -5596,10 +5620,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -5607,14 +5631,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [3, 1] - MIWaveTileA: 3 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 48 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 48 + MacroTileA: 32 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -5635,22 +5659,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -5658,7 +5682,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -5668,13 +5692,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT3_1_MO40_NTn1_NTA1_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -5683,17 +5707,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 1 - ThreadTileA: 12 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5712,22 +5736,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -5746,6 +5770,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5753,12 +5778,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x12f43K0NWsDi5cNJjR7R2JTcg_yqtfEHj2PuqgoJwtDk= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3u9DB_8S6rR7nconKZLZIT-er2SW2W92RF0q4O62ZiGo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -5783,7 +5808,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -5795,36 +5820,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 9216 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 4608 LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -5832,35 +5857,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -5872,22 +5897,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 6 + NonTemporalB: 3 + NonTemporalC: 5 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 + NumElementsPerBatchStore: 16 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -5904,7 +5929,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -5914,22 +5939,22 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -5947,14 +5972,14 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -5982,6 +6007,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -5989,7 +6015,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3okxSER2kK2UA9ov1OBTPhQlusC6KlbBX40fJhWBRxnQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3Jm5CiGdfbFXujnDqBqMXTdu4rLuafV0fP7zUlp_z2K4= BufferLoad: true BufferStore: true CUCount: null @@ -6009,7 +6035,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -6031,34 +6057,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 9216 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 4608 LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 @@ -6079,14 +6105,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 2] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -6107,30 +6133,30 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 3 NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalC: 7 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 + NumElementsPerBatchStore: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -6140,7 +6166,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -6155,9 +6181,9 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 + SubGroup0: 2 SubGroup1: 64 - SubGroupA: 4 + SubGroupA: 2 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false @@ -6166,12 +6192,12 @@ ThreadTile1: 1 ThreadTileA: 16 ThreadTileB: 1 - TransposeLDS: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -6190,7 +6216,7 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -6218,6 +6244,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6225,7 +6252,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1vHhtns6gPgv2zdnViNHWKe69UlMziiY8ogzUIM-cXLA= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3csRS0Q7OBQrJTgMfhnmBap6_YbZwht-0nasXx6jerRk= BufferLoad: true BufferStore: true CUCount: null @@ -6245,7 +6272,7 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -6255,7 +6282,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6267,36 +6294,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 9216 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 4608 LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 20992 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -6304,35 +6331,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -6350,23 +6377,23 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 + NumElementsPerBatchStore: 16 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -6376,7 +6403,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -6386,28 +6413,28 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -6419,14 +6446,14 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 @@ -6450,10 +6477,11 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6461,20 +6489,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16axuOnARtwf3Zw_BH37e0vrXw3uX3YI2sxTIy41cp-BQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1G-Hk-cvpdoZpl53xXJtdFhfZ8BYYo4TdUsDuo483nkI= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6491,7 +6519,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6503,34 +6531,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 14336 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 14336 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 + LdsOffsetMetadata: 14336 + LdsOffsetMetadata_Blk: 21504 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -6538,10 +6566,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -6552,14 +6580,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -6573,28 +6601,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalA: 3 + NonTemporalB: 0 + NonTemporalC: 6 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6603,7 +6631,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6612,17 +6640,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB4_NTC1_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -6634,16 +6662,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -6655,29 +6683,29 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 16 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -6690,6 +6718,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6697,12 +6726,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT320x192x32_MI32tuSZnvvLkXYcMml-mOShaT9zCWrl5dAd9wDAlRsHyF0= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1-xkfgfCFtFLl4KyFyWYpOSiCNaOYMrDlymFG759qbfs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -6722,7 +6751,7 @@ ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 + GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -6739,36 +6768,36 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT320x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB1_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 LSPA: 32 - LSPB: 16 + LSPB: 32 LVCA: 8 - LVCB: 16 + LVCB: 8 LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 147456 + LdsBytesNoAmax: 30720 LdsInitCVgprs: false - LdsNumBytes: 147456 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 30720 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 73728 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 119808 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 46080 - LdsOffsetMetadata_Blk: 119808 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -6776,11 +6805,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -6788,23 +6817,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [5, 3] - MIWaveTileA: 5 - MIWaveTileB: 3 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 192 - MacroTileA: 320 - MacroTileB: 192 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -6816,21 +6845,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 240 - NumLoadsA: 10 - NumLoadsB: 12 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -6839,7 +6868,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -6848,32 +6877,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT320x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB1_NTC5_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 3 - ThreadTileA: 80 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -6892,14 +6921,14 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -6913,7 +6942,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -6926,6 +6955,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -6933,20 +6963,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x32_MI16x1j604NGXToiBb5XPUVM0hkt4WXLh6NV34LIceG_D6gTg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3fwCm5jm6HmwYFF2F9NEH2-i6ut4jdo_U1ZXcaGHnHUQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -6963,7 +6993,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -6975,99 +7005,99 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 28864 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 28864 + LdsNumElementsAlignedA: 4160 + LdsNumElementsAlignedB: 8320 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4160 + LdsOffsetB_Blk: 20544 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 4160 + LdsOffsetMetadata_Blk: 20544 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [1, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 + NonTemporalA: 0 + NonTemporalB: 3 NonTemporalC: 6 - NonTemporalD: 7 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 - NumThreads: 256 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -7075,7 +7105,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -7084,38 +7114,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 29 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB1_NTC6_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 64 + SubGroupA: 2 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -7127,16 +7157,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -7162,6 +7192,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7169,27 +7200,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x32_MI16x1leQ8mjkIFyIZuDPwBWpvxiJZrseNgqKruLAXi8hCJNQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x128_MI16x1qqLf9V2sWUhTTjo08OvLbGEZOd8IOsdSVoXLWBlvt8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 @@ -7199,7 +7230,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7211,34 +7242,34 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 57344 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 57344 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -7246,10 +7277,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -7260,14 +7291,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7281,28 +7312,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 4 + NonTemporalB: 2 + NonTemporalC: 6 NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7310,7 +7341,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -7320,7 +7351,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 30 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA2_NTB1_NTC4_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -7330,7 +7361,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -7342,10 +7373,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7363,8 +7394,8 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -7376,10 +7407,10 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -7398,6 +7429,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7405,20 +7437,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x32_MI32xt2IWy6oc3iaHRd0qwAxR-vlfdl6TmzLLHgr_zeUr1t0= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x128_MI16x2tRUZll8ei_2fEcBgvm8QHlf-NZKX4GlNtbfPsV0B7Y= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -7435,7 +7467,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7447,48 +7479,48 @@ SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -7496,49 +7528,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 1 NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 6 + NonTemporalC: 6 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7556,32 +7588,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 31 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB3_NTC7_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 1 - ThreadTileA: 32 - ThreadTileB: 1 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -7599,23 +7631,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -7632,8 +7664,9 @@ tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7641,7 +7674,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xUaGw_W8LDx3RieUoB0YjL0FezgOAUjz28DCpyFICrg4= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16DbJizEd-UmOez0mMSJd0N_uM7eLvQVu0Z44HBDIDLQQ= BufferLoad: true BufferStore: true CUCount: null @@ -7651,10 +7684,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -7664,6 +7697,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -7671,7 +7705,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -7680,51 +7714,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB6_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -7732,49 +7766,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 0 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -7792,42 +7826,43 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 32 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC4_NTD7_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB6_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -7835,23 +7870,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -7862,14 +7897,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -7877,7 +7914,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xnPDI4eDL8B8YkIMGYr-3DuqBaS61ghWLUzLSEYUQwcQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16hJIai0BInLZV7hgR5cDeaVnCphwevySseItKMzY2S58= BufferLoad: true BufferStore: true CUCount: null @@ -7900,6 +7937,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -7916,10 +7954,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC5_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB7_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -7932,21 +7970,21 @@ LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 116224 + LdsNumBytes: 135168 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 67584 LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata_Blk: 101376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -7959,7 +7997,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -7968,14 +8006,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -7996,21 +8034,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 5 - NonTemporalD: 0 + NonTemporalB: 7 + NonTemporalC: 3 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8028,7 +8066,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 33 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA1_NTB5_NTC5_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB7_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 @@ -8036,7 +8074,7 @@ StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 @@ -8051,19 +8089,20 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -8072,7 +8111,7 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -8080,7 +8119,7 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -8098,14 +8137,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8113,20 +8154,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI167teIb582yCf_SJBIlrVLw1F7Ht3BtRFeb0kjf6Rcpyk= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI162FQf5x0Ic9M0LiIiM9wZ1HB5yGNVuG3Z50nYqhZpdHA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -8136,6 +8177,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -8152,37 +8194,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 117760 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -8190,10 +8232,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -8203,15 +8245,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 4] + MIWaveTileA: 8 MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 256 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8225,27 +8267,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 4 + NonTemporalA: 2 + NonTemporalB: 5 + NonTemporalC: 6 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -8255,7 +8297,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8264,42 +8306,43 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 34 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC4_NTD2_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 32 ThreadTile1: 4 - ThreadTileA: 16 + ThreadTileA: 32 ThreadTileB: 4 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -8314,34 +8357,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8349,20 +8394,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI323ndqhClmLf66x-DMa5vtrsXdR3sinuyraIFtI5pO3GY= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xA0Nndy-Gm0_-9jnJPOikGUN_FqV0gpTqShRRDXkv1uQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -8372,6 +8417,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -8388,27 +8434,27 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB6_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116736 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 116736 + LdsNumBytes: 116224 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -8419,20 +8465,20 @@ LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 33792 LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -8444,39 +8490,39 @@ MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalB: 7 + NonTemporalC: 7 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -8500,12 +8546,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 35 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB6_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM8_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -8514,17 +8560,17 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 64 + ThreadTile0: 16 ThreadTile1: 2 - ThreadTileA: 64 + ThreadTileA: 16 ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true @@ -8535,12 +8581,13 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 @@ -8550,34 +8597,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8585,7 +8634,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI162pz3v-E27-meies96c7sXE26vltpCAQ0pdpY2inCyGg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI162WYI2MxHIrVkq3FQOShW9mBomKJ0mhyfVvurs93vJTE= BufferLoad: true BufferStore: true CUCount: null @@ -8608,6 +8657,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -8624,10 +8674,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -8640,21 +8690,21 @@ LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 69632 + LdsBytesNoAmax: 117760 LdsInitCVgprs: false - LdsNumBytes: 69632 + LdsNumBytes: 117760 LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 + LdsOffsetA_Blk: 65536 LdsOffsetB: 34816 - LdsOffsetB_Blk: 165888 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 69632 - LdsOffsetMetadata_Blk: 165888 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -8676,14 +8726,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] + MIWaveTile: [8, 4] MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 256 - MacroTile1: 256 + MacroTile1: 128 MacroTileA: 256 - MacroTileB: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8704,21 +8754,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 2 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -8736,21 +8786,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 36 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC7_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 5 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -8759,9 +8809,9 @@ SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 32 - ThreadTileB: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -8771,7 +8821,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -8787,7 +8838,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 + WorkGroupMapping: 24 WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -8806,14 +8857,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -8821,20 +8874,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16Rai2BMU5qUNE8ec14Xy8sjzYfYE4wTy8B1l9bpYhNTM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16z6K3Sb_rajFl7CvRhVdmX-td587lL_0kZQhA44LAkbg= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -8844,6 +8897,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -8860,37 +8914,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 67584 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 101376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 101376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -8898,10 +8952,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -8912,14 +8966,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -8933,22 +8987,22 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 @@ -8963,7 +9017,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -8972,21 +9026,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 37 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA1_NTB0_NTC6_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: true - StoreSyncOpt: 4 + StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -8994,20 +9048,21 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -9023,33 +9078,35 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 16 + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9057,7 +9114,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x256x32_MI16_hmfRslJ7_UM9E5_Eh9cjKTPKXxOaGMYtgTpwcJhKMM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16S2etZF6YGoCywFVWpPuTkl82pskkkPSvkb_V_rvYnYM= BufferLoad: true BufferStore: true CUCount: null @@ -9080,6 +9137,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -9087,7 +9145,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -9096,10 +9154,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC2_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -9109,24 +9167,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 125952 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 125952 - LdsNumElementsAlignedA: 25600 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25600 - LdsOffsetB_Blk: 91136 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 91136 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -9148,13 +9206,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [5, 8] - MIWaveTileA: 5 + MIWaveTile: [8, 8] + MIWaveTileA: 8 MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 160 + MacroTile0: 256 MacroTile1: 256 - MacroTileA: 160 + MacroTileA: 256 MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -9175,21 +9233,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 2 + NonTemporalB: 2 + NonTemporalC: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 160 - NumGlobalWriteVectorsPerThread: 160 - NumLoadsA: 5 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -9208,21 +9266,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 38 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x256x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_8_MO40_NTn1_NTA3_NTB0_NTC1_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC2_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -9230,20 +9288,21 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 20 + ThreadTile0: 32 ThreadTile1: 8 - ThreadTileA: 20 + ThreadTileA: 32 ThreadTileB: 8 - TransposeLDS: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -9251,7 +9310,7 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -9259,8 +9318,8 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9278,14 +9337,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9293,12 +9354,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI32WgES1xrDYvB_Vsiz8nGLdWqeSq_GbrDnca8tIfV9V44= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16zhQEOYLLGxWCuOEX40LQg-IfZTSUsdq7e3ct5BY82X4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -9316,14 +9377,15 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -9332,39 +9394,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_1_MO40_NTn1_NTA3_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 41472 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 41472 - LdsNumElementsAlignedA: 23040 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 23040 - LdsOffsetB_Blk: 88576 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 41472 - LdsOffsetMetadata_Blk: 88576 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -9372,35 +9434,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [5, 1] - MIWaveTileA: 5 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -9413,20 +9475,20 @@ NonTemporal: -1 NonTemporalA: 3 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 - NumLoadsB: 4 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9435,7 +9497,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9444,32 +9506,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 39 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_1_MO40_NTn1_NTA3_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 1 - ThreadTileA: 80 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9479,7 +9541,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -9487,16 +9550,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9509,19 +9572,21 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9529,7 +9594,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT224x32x64_MI16xSLoxTzKJhLeYO6yA7D7HcG6nMu1LARYTpZpldzWXL4U= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI32xhzz0mUXjdKncv9yWH36CWKfQiJUxDnvM6oldFtHiGPQ= BufferLoad: true BufferStore: true CUCount: null @@ -9552,6 +9617,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -9568,10 +9634,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA5_NTB7_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -9582,25 +9648,25 @@ LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 73728 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 73728 - LdsNumElementsAlignedA: 64512 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 64512 - LdsOffsetB_Blk: 195584 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 73728 - LdsOffsetMetadata_Blk: 195584 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -9608,11 +9674,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 4 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -9620,23 +9686,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [7, 1] - MIWaveTileA: 7 - MIWaveTileB: 1 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 32 - MacroTileA: 224 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -9647,22 +9713,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 + NonTemporalA: 7 NonTemporalB: 7 NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 28 - NumGlobalWriteVectorsPerThread: 28 - NumLoadsA: 14 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 14 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9680,32 +9746,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 40 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x32x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_1_MO40_NTn1_NTA5_NTB7_NTC0_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 1 - ThreadTileA: 28 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -9715,24 +9781,25 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9741,7 +9808,7 @@ _DepthUB: 64 _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -9750,14 +9817,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -9765,29 +9834,30 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT224x160x32_MI16seqDTzLmKwfO-jGw_7q0e0kRLQ9kjh4yCHvuyvaH2cs= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xj7U-zgwpd4zPGOHREedIwdwjVu8pyLWNmqSZsXIQfiQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -9804,10 +9874,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -9817,38 +9887,38 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 59392 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 29568 - LdsNumElementsAlignedB: 21120 + LdsNumBytes: 59392 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 29568 - LdsOffsetB_Blk: 95104 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 29568 - LdsOffsetMetadata_Blk: 95104 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -9856,49 +9926,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [7, 5] - MIWaveTileA: 7 - MIWaveTileB: 5 + MIWaveTile: [1, 2] + MIWaveTileA: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 224 - MacroTile1: 160 - MacroTileA: 224 - MacroTileB: 160 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 0 + NonTemporalA: 0 + NonTemporalB: 3 + NonTemporalC: 1 NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 140 - NumGlobalWriteVectorsPerThread: 140 - NumLoadsA: 7 - NumLoadsB: 5 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 7 - NumLoadsPerpendicularB: 5 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -9906,8 +9976,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -9916,59 +9986,60 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 41 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT224x160x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_5_MO40_NTn1_NTA1_NTB1_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 28 - ThreadTile1: 5 - ThreadTileA: 28 - ThreadTileB: 5 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -9977,7 +10048,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -9986,14 +10057,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10001,7 +10074,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16xN2l8jPNpvUrR9ihG-g4bbj4txPWtM9VLB2cEdM2wQDo= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x256_MI16x9N4FL5Gr-S5lZKFQp99NVD859UtIa5XyMClMxXi3-_8= BufferLoad: true BufferStore: true CUCount: null @@ -10011,7 +10084,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 256 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -10024,6 +10097,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -10031,7 +10105,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -10040,50 +10114,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB7_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 + LdsBytesNoAmax: 133120 LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 133120 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 66560 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 99840 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 99840 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -10091,10 +10165,10 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 32 MacroTile1: 32 @@ -10119,22 +10193,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalA: 3 + NonTemporalB: 7 + NonTemporalC: 0 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 + NumElementsPerBatchStore: 8 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10152,32 +10226,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 42 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB6_NTC1_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB7_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false + StoreSwapAddr: true StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10187,7 +10261,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -10195,23 +10270,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -10222,14 +10297,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10237,29 +10314,30 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1KDrJ4Ic4uP1T91fz80L-1p9uxPjaRg8FouAoxpGOl90= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI16xDXqX3nOI1ws82SuzK10Xb7Lyblmqqz7YODwj8fdf6Ec= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -10267,7 +10345,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -10276,10 +10354,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB5_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -10289,24 +10367,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30720 + LdsBytesNoAmax: 152064 LdsInitCVgprs: false - LdsNumBytes: 30720 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 152064 + LdsNumElementsAlignedA: 8448 + LdsNumElementsAlignedB: 67584 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 76032 + LdsOffsetB: 8448 + LdsOffsetB_Blk: 84480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 + LdsOffsetMetadata: 8448 + LdsOffsetMetadata_Blk: 84480 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -10314,8 +10392,8 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false @@ -10327,15 +10405,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -10349,28 +10427,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 + NonTemporalA: 0 + NonTemporalB: 5 NonTemporalC: 7 - NonTemporalD: 4 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -10378,8 +10456,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10388,42 +10466,43 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 43 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB5_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -10431,16 +10510,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -10453,19 +10532,21 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10473,12 +10554,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3u9DB_8S6rR7nconKZLZIT-er2SW2W92RF0q4O62ZiGo= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x320x32_MI16xEDT-TS7XsVhCqRrDXPaaM9Dw-865cKdJ2bN2UQP64eM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -10493,9 +10574,10 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -10503,7 +10585,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -10512,39 +10594,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA2_NTB5_NTC0_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30208 + LdsBytesNoAmax: 59904 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 51200 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 59904 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -10552,35 +10634,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 320 + MacroTileA: 64 + MacroTileB: 320 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -10591,31 +10673,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 4 + NonTemporalA: 2 + NonTemporalB: 5 + NonTemporalC: 0 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularB: 10 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -10624,32 +10706,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 44 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC5_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA2_NTB5_NTC0_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 + SubGroup0: 4 SubGroup1: 64 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 5 ThreadTileA: 16 - ThreadTileB: 1 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -10659,24 +10741,25 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -10685,7 +10768,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -10694,14 +10777,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10709,7 +10794,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3Jm5CiGdfbFXujnDqBqMXTdu4rLuafV0fP7zUlp_z2K4= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16x7eJMtzuep7ti0IbbweWe307G-lT2FmDeeBlMvdoLAfg= BufferLoad: true BufferStore: true CUCount: null @@ -10719,19 +10804,20 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 128 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -10748,109 +10834,109 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB4_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30208 + LdsBytesNoAmax: 99328 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 99328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 16896 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] + MIWaveGroup: [2, 2] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 4 + NonTemporalA: 2 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -10860,7 +10946,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 45 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC7_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB4_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -10875,32 +10961,33 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -10910,18 +10997,18 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -10930,14 +11017,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -10945,7 +11034,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3csRS0Q7OBQrJTgMfhnmBap6_YbZwht-0nasXx6jerRk= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x64x32_MI16xByhPKgVjq4CDyWj-cbHC-i4QzDiSS4uWjZj1qyr1xvg= BufferLoad: true BufferStore: true CUCount: null @@ -10965,9 +11054,10 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -10984,39 +11074,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA2_NTB3_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30208 + LdsBytesNoAmax: 100352 LdsInitCVgprs: false - LdsNumBytes: 30208 - LdsNumElementsAlignedA: 4608 + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 25600 LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 20992 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 25600 + LdsOffsetMetadata_Blk: 91136 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -11024,35 +11114,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 2] + MIWaveTileA: 5 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 160 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 160 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -11063,31 +11153,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 5 + NonTemporalA: 2 + NonTemporalB: 3 + NonTemporalC: 0 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 40 + NumLoadsA: 5 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 2 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11096,7 +11186,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 46 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA2_NTB3_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -11111,17 +11201,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 20 + ThreadTile1: 2 + ThreadTileA: 20 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11131,24 +11221,25 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -11157,7 +11248,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -11166,14 +11257,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11181,20 +11274,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1G-Hk-cvpdoZpl53xXJtdFhfZ8BYYo4TdUsDuo483nkI= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xd-sQFx5RImj-B93gHPT59pdW-PfpHkRagGWfDrPaTnY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -11204,6 +11297,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -11211,7 +11305,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11220,37 +11314,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14336 + LdsBytesNoAmax: 116224 LdsInitCVgprs: false - LdsNumBytes: 14336 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 116224 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 14336 - LdsOffsetMetadata_Blk: 21504 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -11258,10 +11352,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -11272,13 +11366,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveTile: [4, 2] + MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 128 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -11293,28 +11387,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 4 + NonTemporalA: 1 + NonTemporalB: 7 + NonTemporalC: 0 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -11323,7 +11417,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -11332,21 +11426,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 47 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA3_NTB0_NTC6_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -11354,9 +11448,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 16 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 16 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -11367,15 +11461,16 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true - UseDotInstruction: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -11384,32 +11479,34 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11417,7 +11514,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI16x1-xkfgfCFtFLl4KyFyWYpOSiCNaOYMrDlymFG759qbfs= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1-95uwA7thjYZN8Q0bg1wWhpCED9_VDGCoINGlVsybm0= BufferLoad: true BufferStore: true CUCount: null @@ -11440,6 +11537,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -11447,7 +11545,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11456,10 +11554,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -11469,24 +11567,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 30720 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 30720 - LdsNumElementsAlignedA: 5120 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 9216 LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -11508,13 +11606,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -11535,21 +11633,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 7 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 @@ -11568,7 +11666,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 48 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -11578,7 +11676,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -11590,9 +11688,9 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -11603,7 +11701,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -11611,7 +11710,7 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -11620,7 +11719,7 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -11638,14 +11737,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11653,20 +11754,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x32_MI32x3fwCm5jm6HmwYFF2F9NEH2-i6ut4jdo_U1ZXcaGHnHUQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32NrolHpaoNMGubHMXg2QZmfmFTUHLarzXy0RPUp34mNo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -11676,6 +11777,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -11683,7 +11785,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11692,37 +11794,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 28864 + LdsBytesNoAmax: 100352 LdsInitCVgprs: false - LdsNumBytes: 28864 - LdsNumElementsAlignedA: 4160 - LdsNumElementsAlignedB: 8320 + LdsNumBytes: 100352 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4160 - LdsOffsetB_Blk: 20544 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4160 - LdsOffsetMetadata_Blk: 20544 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 @@ -11730,12 +11832,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -11743,15 +11845,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -11765,29 +11867,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalA: 1 + NonTemporalB: 1 + NonTemporalC: 5 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -11804,32 +11906,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 49 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB3_NTC6_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 + StreamKXCCMapping: 4 + SubGroup0: 4 SubGroup1: 64 - SubGroupA: 2 + SubGroupA: 4 SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 32 + ThreadTile1: 2 + ThreadTileA: 32 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -11839,24 +11941,25 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -11865,23 +11968,25 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -11889,29 +11994,30 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x128_MI16x1qqLf9V2sWUhTTjo08OvLbGEZOd8IOsdSVoXLWBlvt8= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI16K1w2va1wKhvgZoTyL5zX8YLee7JqsF5V6wtSMY1I-GE= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -11919,7 +12025,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -11928,37 +12034,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB6_NTC4_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 117760 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -11966,10 +12072,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -11979,15 +12085,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] + MIWaveTileA: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12001,28 +12107,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalA: 1 + NonTemporalB: 6 + NonTemporalC: 4 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12030,8 +12136,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12040,42 +12146,43 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 50 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA2_NTB2_NTC6_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB6_NTC4_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 4 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -12083,41 +12190,43 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: false + ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12125,7 +12234,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x64x128_MI16x2tRUZll8ei_2fEcBgvm8QHlf-NZKX4GlNtbfPsV0B7Y= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3HCPsg4TodlR6Q8cd0m5gwDltLOtlMg4yX28q5bkM3nc= BufferLoad: true BufferStore: true CUCount: null @@ -12135,7 +12244,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 + DepthU: 32 DirectToLds: true DirectToLdsA: true DirectToLdsB: true @@ -12148,6 +12257,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -12164,39 +12274,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 49408 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 49408 + LdsNumElementsAlignedA: 8320 + LdsNumElementsAlignedB: 8320 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8320 + LdsOffsetB_Blk: 41088 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 8320 + LdsOffsetMetadata_Blk: 41088 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -12204,11 +12314,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: true LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -12216,23 +12326,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -12245,20 +12355,20 @@ NonTemporal: -1 NonTemporalA: 1 NonTemporalB: 3 - NonTemporalC: 6 + NonTemporalC: 5 NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12276,7 +12386,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 51 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x64x128_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA1_NTB3_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -12291,53 +12401,54 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -12346,6 +12457,7 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: false @@ -12354,6 +12466,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12361,20 +12474,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16DbJizEd-UmOez0mMSJd0N_uM7eLvQVu0Z44HBDIDLQQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI16ZwWemFSm3p4igCb8DLsnt6R0foWhVdW4rDKTnbvNCrM= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -12384,7 +12497,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -12404,34 +12517,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB6_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 117760 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 117760 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 100352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -12439,10 +12552,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -12452,14 +12565,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] + MIWaveGroup: [4, 1] + MIWaveTile: [4, 8] MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 256 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 256 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -12474,28 +12587,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 0 + NonTemporalB: 1 + NonTemporalC: 5 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12504,7 +12617,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12513,7 +12626,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 52 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB6_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -12521,30 +12634,30 @@ StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -12564,16 +12677,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -12584,15 +12697,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12600,20 +12714,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16hJIai0BInLZV7hgR5cDeaVnCphwevySseItKMzY2S58= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI1616MP9_S2iGj4HSyT5jYf0jksUCFwInC_pUDeGGq21_Y= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -12623,9 +12737,9 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false + ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer @@ -12643,34 +12757,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB7_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 139264 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 139264 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 69632 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 104448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 104448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -12678,10 +12792,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -12692,14 +12806,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12713,27 +12827,27 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 + NonTemporalA: 0 + NonTemporalB: 2 NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 32 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 32 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -12743,7 +12857,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -12752,12 +12866,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 53 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA1_NTB7_NTC3_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: true @@ -12774,16 +12888,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -12804,34 +12918,35 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -12839,7 +12954,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x256x32_MI162FQf5x0Ic9M0LiIiM9wZ1HB5yGNVuG3Z50nYqhZpdHA= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI1632V6Pxwe_725MnjTV9-vmbwgaM0uPWz-Wc0cWfNZEtw= BufferLoad: true BufferStore: true CUCount: null @@ -12862,7 +12977,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -12882,7 +12997,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC7_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -12895,11 +13010,11 @@ LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 117760 + LdsNumBytes: 34816 LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -12908,7 +13023,7 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 + LdsOffsetMetadata: 34816 LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 @@ -12930,15 +13045,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [8, 4] - MIWaveTileA: 8 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 256 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 256 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -12958,22 +13073,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 6 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 7 + NonTemporalD: 6 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -12991,38 +13106,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 54 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA2_NTB5_NTC6_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC7_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 16 ThreadTile1: 4 - ThreadTileA: 32 + ThreadTileA: 16 ThreadTileB: 4 - TransposeLDS: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13032,7 +13147,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 4 @@ -13042,9 +13157,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -13053,16 +13168,16 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true @@ -13071,6 +13186,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13078,7 +13194,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI16x1H-2kp_jB-adLzEFBXJF42UhvA7Xsaxxj-pEwhy1dSyQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI16q_1WHADry2cTxAoNmb44qR0nw6Q5GfHEK9JlilAlfio= BufferLoad: true BufferStore: true CUCount: null @@ -13088,7 +13204,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -13109,7 +13225,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13121,34 +13237,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB2_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 25600 LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 91136 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -13158,8 +13274,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -13170,14 +13286,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13197,21 +13313,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 0 NonTemporalB: 2 - NonTemporalC: 6 - NonTemporalD: 5 + NonTemporalC: 0 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 5 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -13221,7 +13337,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13230,7 +13346,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 55 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB2_NTC6_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -13239,8 +13355,8 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -13252,16 +13368,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13271,28 +13387,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -13310,6 +13426,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13317,17 +13434,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3FEJ1XNoJafcDx9fHEvs_ccmeJEo5H77-RB7nM5avMyc= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1Tt22kH2e9CMY6k_Y8O_l57cXp8oRQ2KAhNqC8vG9-is= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -13360,36 +13477,36 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB4_NTC0_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 - LVCA: 16 - LVCB: 16 + LVCA: 8 + LVCB: 8 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 9728 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 9728 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 9728 + LdsOffsetMetadata_Blk: 21504 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -13397,35 +13514,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] + MIWaveGroup: [2, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -13436,23 +13553,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 6 + NonTemporalA: 5 + NonTemporalB: 4 + NonTemporalC: 0 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -13460,7 +13577,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -13469,7 +13586,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 56 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB1_NTC4_NTD6_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB4_NTC0_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -13478,29 +13595,29 @@ StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -13514,22 +13631,22 @@ Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 1 _VectorStore: 1 @@ -13545,10 +13662,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13556,27 +13674,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xA0Nndy-Gm0_-9jnJPOikGUN_FqV0gpTqShRRDXkv1uQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1tPRLVkTDR1Eck8xeuIldFLH6hKJV_G81Iz6F9AruisE= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -13587,7 +13705,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13599,34 +13717,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC7_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 9216 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 9216 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 20992 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -13634,12 +13752,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13647,15 +13765,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13669,36 +13787,36 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 + NonTemporalA: 6 + NonTemporalB: 2 NonTemporalC: 7 - NonTemporalD: 0 + NonTemporalD: 7 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -13708,31 +13826,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 57 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB7_NTC7_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC7_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreSyncOpt: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 16 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true @@ -13752,29 +13870,29 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -13782,12 +13900,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -13795,7 +13914,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI162WYI2MxHIrVkq3FQOShW9mBomKJ0mhyfVvurs93vJTE= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x48x32_MI16x77c5vCqhF37wngAzGqfnFLKmA0iPzLeWaJiRYnuXPkU= BufferLoad: true BufferStore: true CUCount: null @@ -13818,7 +13937,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -13826,7 +13945,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -13838,34 +13957,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x48x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 33280 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 33280 + LdsNumElementsAlignedA: 25600 + LdsNumElementsAlignedB: 7680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 25600 + LdsOffsetB_Blk: 91136 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 91136 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -13878,7 +13997,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -13886,15 +14005,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 4] - MIWaveTileA: 8 - MIWaveTileB: 4 + MIWaveGroup: [2, 1] + MIWaveTile: [5, 3] + MIWaveTileA: 5 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 160 + MacroTile1: 48 + MacroTileA: 160 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -13915,22 +14034,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 7 + NonTemporalB: 4 NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 60 + NumGlobalWriteVectorsPerThread: 60 + NumLoadsA: 10 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 3 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -13947,32 +14066,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 58 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB7_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM5_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x48x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 256 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreSyncOpt: 1 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 5 + StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 20 + ThreadTile1: 3 + ThreadTileA: 20 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -13988,19 +14107,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -14009,24 +14128,25 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14034,27 +14154,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x64_MI16z6K3Sb_rajFl7CvRhVdmX-td587lL_0kZQhA44LAkbg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x448x32_MI16kicIlK4eREQNKFD_4qwLZk8EPghmo4TEg7fAcQxRmDY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -14077,34 +14197,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB2_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 89088 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 89088 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 71680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 67584 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 101376 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 101376 + LdsOffsetMetadata: 89088 + LdsOffsetMetadata_Blk: 148480 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -14112,10 +14232,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -14125,15 +14245,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 7] + MIWaveTileA: 8 + MIWaveTileB: 7 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 448 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 448 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -14147,28 +14267,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 1 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 2 + NonTemporalC: 0 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 224 + NumGlobalWriteVectorsPerThread: 56 + NumLoadsA: 4 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 14 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14176,8 +14296,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14186,38 +14306,38 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 59 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA3_NTB7_NTC1_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB2_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 0 + StoreSwapAddr: false + StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 7 + ThreadTileA: 32 + ThreadTileB: 7 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -14227,28 +14347,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -14260,12 +14380,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14273,12 +14394,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16S2etZF6YGoCywFVWpPuTkl82pskkkPSvkb_V_rvYnYM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32WmWOeC1HZXF52s2qw5enQVIcphzfzkpjTKp5gB44mAw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -14296,7 +14417,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -14304,7 +14425,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14316,7 +14437,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC2_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_4_MO40_NTn1_NTA4_NTB1_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -14326,26 +14447,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadA: 128 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 35328 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 83968 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -14353,35 +14474,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -14392,22 +14513,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 2 - NonTemporalD: 3 + NonTemporalA: 4 + NonTemporalB: 1 + NonTemporalC: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 256 + NumElementsPerBatchStore: 16 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14416,7 +14537,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14425,21 +14546,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 60 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA2_NTB2_NTC2_NTD3_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_4_MO40_NTn1_NTA4_NTB1_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM48_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -14447,16 +14568,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -14466,19 +14587,19 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 1 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 1 + WorkGroup: [128, 2, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -14487,7 +14608,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -14496,7 +14617,7 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true @@ -14505,6 +14626,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14512,12 +14634,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI16zhQEOYLLGxWCuOEX40LQg-IfZTSUsdq7e3ct5BY82X4= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI32xLh-ycjs1oSwZbXiz2hnStK9M-0nCgCFZS7lNjPV_78M= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' @@ -14535,15 +14657,14 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14552,39 +14673,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 109056 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 + LdsNumBytes: 109056 + LdsNumElementsAlignedA: 8704 LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -14592,35 +14713,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 1 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 256 + MacroTile0: 64 MacroTile1: 256 - MacroTileA: 256 + MacroTileA: 64 MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -14631,21 +14752,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 7 + NonTemporalC: 3 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 @@ -14655,7 +14776,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -14664,32 +14785,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 61 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA3_NTB0_NTC0_NTD0_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 128 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 32 - ThreadTileB: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -14699,8 +14820,7 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -14708,16 +14828,16 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 6 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -14730,20 +14850,20 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14751,7 +14871,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT48x192x64_MI16xM9LgWSLPQURN0LQ8_cZ_x-ShHUsOQ4UKzbLGoJT6DOA= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI32xB_DE9RVLGbG_PwgSFrYUPdM2k3sTJC_ftD4lxU9bY6g= BufferLoad: true BufferStore: true CUCount: null @@ -14761,10 +14881,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 32 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -14774,15 +14894,14 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 1 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -14791,51 +14910,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA2_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 69120 + LdsBytesNoAmax: 150528 LdsInitCVgprs: false - LdsNumBytes: 69120 - LdsNumElementsAlignedA: 13824 - LdsNumElementsAlignedB: 55296 + LdsNumBytes: 150528 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 66560 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 13824 - LdsOffsetB_Blk: 144896 + LdsOffsetA_Blk: 75264 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 83968 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 69120 - LdsOffsetMetadata_Blk: 144896 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 83968 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true LoopIters: 2 - LoopUnroll: 64 + LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -14843,49 +14962,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [3, 3] - MIWaveTileA: 3 - MIWaveTileB: 3 + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 48 - MacroTile1: 192 - MacroTileA: 48 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 2 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 7 + NonTemporalB: 6 + NonTemporalC: 4 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 36 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 3 - NumLoadsB: 12 + NumElementsPerBatchStore: 2 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -14903,69 +15022,68 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 62 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x192x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA2_NTB2_NTC5_NTD7_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 4 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 12 - ThreadTile1: 3 - ThreadTileA: 12 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 32 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -14974,15 +15092,15 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -14990,7 +15108,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI160gzIbQL4-jpzPH-VghbnvNdTTqRF2iTl-Bj4jYdi7Kc= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI32xNPMvn-evBNCDHAzhSPgPljENZ_iAGw5fkD0ulDNYMWc= BufferLoad: true BufferStore: true CUCount: null @@ -15010,10 +15128,9 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -15021,7 +15138,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -15030,10 +15147,10 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA3_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB6_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 32 LSCB: 32 @@ -15043,26 +15160,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 76288 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 25600 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 76288 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 67584 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25600 - LdsOffsetB_Blk: 91136 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 139776 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 91136 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 76288 + LdsOffsetMetadata_Blk: 139776 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -15070,35 +15187,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 + LoopIters: 2 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 4] - MIWaveTileA: 5 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 4] + MIWaveTileA: 2 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 512 + MacroTileA: 64 + MacroTileB: 512 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -15110,21 +15227,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalB: 6 + NonTemporalC: 5 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 - NumLoadsB: 4 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -15132,8 +15249,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -15142,31 +15259,31 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 63 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA3_NTB3_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB6_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreSyncOpt: 0 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 20 + ThreadTile0: 32 ThreadTile1: 4 - ThreadTileA: 20 + ThreadTileA: 32 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -15177,16 +15294,15 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -15194,7 +15310,7 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 24 WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false @@ -15204,24 +15320,24 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 1 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15229,17 +15345,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI32xhzz0mUXjdKncv9yWH36CWKfQiJUxDnvM6oldFtHiGPQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x384x32_MI32xJv85LAZjXvrheXhR6Pb2-1BpR8Zhns4NnJ21cIAl2Hw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 + DepthU: 32 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -15252,7 +15368,6 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -15260,7 +15375,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -15269,37 +15384,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x384x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 129536 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 129536 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 8704 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 4 LdsPadB: 4 LdsPadMetadata: 0 @@ -15309,10 +15424,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 2 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -15320,15 +15435,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 384 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -15348,22 +15463,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 7 - NonTemporalC: 0 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 1 NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 96 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 2 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -15381,69 +15496,68 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 64 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA7_NTB7_NTC0_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x384x32_MI32x32x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 1 - StoreVectorWidth: 1 + StoreVectorWidth: 2 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 4 + SubGroup0: 2 + SubGroup1: 128 + SubGroupA: 2 + SubGroupB: 128 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 32 + ThreadTile1: 3 + ThreadTileA: 32 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -15452,15 +15566,15 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 1 + - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15468,20 +15582,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xXqj08Ftg00Rv2SJi2FzFlO6u1HdA7oX4HR8160_tNf0= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x256_MI16xW4r92vEXVihDdHo-lHvJ0uNEpIQK9BpFRUZ08nMbzCs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 256 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -15491,7 +15605,6 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -15508,95 +15621,95 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC4_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 115712 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 115712 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 33280 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 1] MIWaveTile: [2, 1] MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 + NonTemporalA: 7 NonTemporalB: 1 - NonTemporalC: 4 - NonTemporalD: 6 + NonTemporalC: 1 + NonTemporalD: 5 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -15620,12 +15733,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 65 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA6_NTB1_NTC4_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -15636,15 +15749,15 @@ StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 8 ThreadTile1: 1 - ThreadTileA: 32 + ThreadTileA: 8 ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true @@ -15655,13 +15768,12 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -15671,16 +15783,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 1 _VectorStore: 1 @@ -15691,15 +15803,15 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true + tailLoopOptA: false + tailLoopOptB: false - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15707,7 +15819,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x256x32_MI16x_7t2JVerTtHT1wEv8es6jrjUmRGlaHN5v8t1PAlVSK4= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xwGhVm6bClKczrRVvyM1JDrgb5E0uKGXdH1TpIh1TL0o= BufferLoad: true BufferStore: true CUCount: null @@ -15717,7 +15829,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -15730,7 +15842,6 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -15738,7 +15849,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -15747,48 +15858,48 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB1_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 39424 + LdsBytesNoAmax: 17408 LdsInitCVgprs: false - LdsNumBytes: 39424 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 17408 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 70144 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 41472 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 39424 - LdsOffsetMetadata_Blk: 70144 + LdsOffsetMetadata: 17408 + LdsOffsetMetadata_Blk: 41472 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -15798,15 +15909,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -15826,23 +15937,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 4 NonTemporalB: 1 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalC: 1 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 8 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -15859,7 +15970,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 66 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB1_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM0_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -15869,59 +15980,58 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 - TransposeLDS: 1 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 4, 2] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -15930,7 +16040,6 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true @@ -15939,6 +16048,7 @@ ActivationAlt: false ActivationFuncCall: true ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -15946,19 +16056,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI32xj7U-zgwpd4zPGOHREedIwdwjVu8pyLWNmqSZsXIQfiQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT80x16x32_MI16x1TWdBy3MnQQKMOSifSRrwSeieAjAz_Ipy8XM9t_apH8I= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false @@ -15966,13 +16076,12 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 2 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -15986,75 +16095,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x16x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 8 + LSPB: 4 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 + LdsBytesNoAmax: 29504 LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 29504 + LdsNumElementsAlignedA: 10560 + LdsNumElementsAlignedB: 2560 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 10560 + LdsOffsetB_Blk: 26944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 10560 + LdsOffsetMetadata_Blk: 26944 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [5, 1] + MIWaveTileA: 5 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 80 + MacroTile1: 16 + MacroTileA: 80 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -16065,31 +16174,31 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 3 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 4 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 10 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 10 NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16098,32 +16207,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 67 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB3_NTC1_NTD1_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x16x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM0_WGMXCC32_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 7 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 20 + ThreadTile1: 1 + ThreadTileA: 20 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16133,25 +16242,24 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -16160,24 +16268,24 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 0 + _staggerStrideShift: 2 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true + tailLoopOptA: false tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16185,20 +16293,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x256_MI16x9N4FL5Gr-S5lZKFQp99NVD859UtIa5XyMClMxXi3-_8= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x7fd2kUwjm75D0kEujcLkV3_GTKT5wWnTcae2Fu7jWuw= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -16208,7 +16316,6 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -16225,48 +16332,48 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB7_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 2048 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 133120 + LdsBytesNoAmax: 86016 LdsInitCVgprs: false - LdsNumBytes: 133120 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 33280 + LdsNumBytes: 86016 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 52224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 66560 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 99840 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 99840 + LdsOffsetMetadata: 86016 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -16276,15 +16383,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16298,28 +16405,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 3 - NonTemporalB: 7 - NonTemporalC: 0 + NonTemporalB: 6 + NonTemporalC: 1 NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerBatchStore: 14 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16337,32 +16444,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 68 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB7_NTC0_NTD2_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC16_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 4 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 3 ThreadTileA: 8 - ThreadTileB: 2 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16371,9 +16478,8 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false - UseDirect32XEmulation: true - UseDot2F32XEmulation: false + UseCustomMainLoopSchedule: 1 + UseDot2F32XEmulation: true UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -16382,22 +16488,22 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -16408,15 +16514,15 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16424,7 +16530,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI16xR68Z4R7jyAOkgwfr2W7Y3XG7smz3NAVO_kD672DuYEg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x80x128_MI16xEh4DCgEFCt-afK9cUIQCk_5N_QizF04nbCdMOaWqSgw= BufferLoad: true BufferStore: true CUCount: null @@ -16434,7 +16540,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -16444,18 +16550,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 0 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -16467,34 +16573,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 + LSCA: 128 + LSCB: 128 LSPA: 8 - LSPB: 32 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43520 + LdsBytesNoAmax: 156672 LdsInitCVgprs: false - LdsNumBytes: 43520 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 156672 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 43520 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 78336 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 113152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43520 - LdsOffsetMetadata_Blk: 74240 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 113152 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -16504,8 +16610,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -16515,15 +16621,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [4, 1] + MIWaveTile: [1, 5] + MIWaveTileA: 1 + MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 256 + MacroTile1: 80 MacroTileA: 64 - MacroTileB: 256 + MacroTileB: 80 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16544,21 +16650,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalB: 6 + NonTemporalC: 2 + NonTemporalD: 2 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 20 + NumGlobalWriteVectorsPerThread: 20 NumLoadsA: 8 - NumLoadsB: 8 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 10 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16566,8 +16672,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -16576,32 +16682,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 69 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB5_NTC1_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 1 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSwapAddr: true + StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 - SuppressNoLoadLoop: false - SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 5 + ThreadTileA: 4 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -16610,35 +16716,35 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false + UseCustomMainLoopSchedule: 1 UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: -1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [64, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 2 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -16654,8 +16760,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16663,7 +16770,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x32_MI16xhRkmfbWaZLauRvxHEZRkWypKxx8tWVFg5-SELIV8PaU= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16xIjYsJ7nPzSNt9gcCFNyYcDjxOwbmoQKbvftRgAY1G4I= BufferLoad: true BufferStore: true CUCount: null @@ -16673,7 +16780,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -16694,7 +16801,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -16706,34 +16813,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 27136 + LdsBytesNoAmax: 86016 LdsInitCVgprs: false - LdsNumBytes: 27136 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 86016 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 52224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 27136 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 86016 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -16743,10 +16850,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -16754,15 +16861,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -16782,22 +16889,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 2 NonTemporalB: 5 - NonTemporalC: 7 - NonTemporalD: 3 + NonTemporalC: 1 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -16815,8 +16922,8 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 70 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA3_NTB5_NTC7_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 @@ -16829,27 +16936,27 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 6 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false + UseCustomMainLoopSchedule: 1 UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false @@ -16859,23 +16966,23 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -16891,10 +16998,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -16902,20 +17010,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI16xDXqX3nOI1ws82SuzK10Xb7Lyblmqqz7YODwj8fdf6Ec= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16xG8s9vcohdFWaEPW_OKAxb_cYdQ6B4AQZ2nx9kjNOYto= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -16925,7 +17033,7 @@ ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 @@ -16933,7 +17041,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -16945,34 +17053,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB5_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 152064 + LdsBytesNoAmax: 86016 LdsInitCVgprs: false - LdsNumBytes: 152064 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 67584 + LdsNumBytes: 86016 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 52224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 76032 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 84480 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 84480 + LdsOffsetMetadata: 86016 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -16980,10 +17088,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -16993,15 +17101,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 512 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 512 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17015,28 +17123,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 5 - NonTemporalC: 7 - NonTemporalD: 6 + NonTemporalA: 3 + NonTemporalB: 6 + NonTemporalC: 0 + NonTemporalD: 1 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 16 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17045,7 +17153,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -17054,32 +17162,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 71 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB5_NTC7_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 16 + StaggerU: 8 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 + StoreSwapAddr: false + StoreSyncOpt: 1 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -17088,7 +17196,7 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false + UseCustomMainLoopSchedule: 1 UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false @@ -17098,42 +17206,43 @@ UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 0 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17141,7 +17250,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x320x32_MI16xEDT-TS7XsVhCqRrDXPaaM9Dw-865cKdJ2bN2UQP64eM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x34z5jA9IIcOhaYTxrREvShaRXUEMAbqXkdSTqaoGUr0= BufferLoad: true BufferStore: true CUCount: null @@ -17151,7 +17260,7 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 128 DirectToLds: 0 DirectToLdsA: false DirectToLdsB: false @@ -17172,7 +17281,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -17184,34 +17293,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA2_NTB5_NTC0_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 + LdsBytesNoAmax: 86016 LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 51200 + LdsNumBytes: 86016 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 52224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 59904 - LdsOffsetMetadata_Blk: 74240 + LdsOffsetMetadata: 86016 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -17221,10 +17330,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -17232,15 +17341,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 320 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 320 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17260,22 +17369,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 6 + NonTemporalC: 2 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 2 - NumLoadsB: 10 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17284,7 +17393,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -17293,69 +17402,69 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 72 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_5_MO40_NTn1_NTA2_NTB5_NTC0_NTD7_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC2_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 1 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false + UseCustomMainLoopSchedule: 1 UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 6 WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -17369,10 +17478,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17380,27 +17490,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI16x1ClaJAhs9bAnetPpejtwWnPhg54ltN5Lq3cuNtacQMlU= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x9MB_BItjxFkx0YR1b98tLWirgDOvhTRoD-0flsTcQhA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -17411,7 +17521,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -17423,34 +17533,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 + LdsBytesNoAmax: 86016 LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 86016 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 52224 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 + LdsOffsetMetadata: 86016 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -17458,11 +17568,11 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 @@ -17472,14 +17582,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -17493,28 +17603,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 1 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 8 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17522,7 +17632,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -17532,17 +17642,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 73 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA3_NTB0_NTC5_NTD5_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 + StaggerUStride: 512 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -17554,29 +17664,29 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false + UseCustomMainLoopSchedule: 1 UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -17584,17 +17694,17 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMapping: 0 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -17606,12 +17716,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17619,27 +17730,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1sKWcpwZgqDGbasNzEHbLdPIjLWTrCFpI8w8F6Ex2Yns= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x128_MI32o1cp8AHeO1wh-BhDUt3-ptcU4Mtb1M0g3Gvr_oirH9Q= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -17662,48 +17773,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24832 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 24832 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 4224 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 101376 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 + LdsOffsetA_Blk: 262144 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 295936 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 135168 + LdsOffsetMetadata_Blk: 295936 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [16, 16, 32, 1, 1, 1] + MIArchVgpr: 0 + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -17711,49 +17822,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] + MIWaveTile: [1, 3] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 4 + NonTemporalA: 3 + NonTemporalB: 5 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 24 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 24 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -17761,8 +17872,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -17771,41 +17882,41 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 74 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false + UseCustomMainLoopSchedule: 1 UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false @@ -17822,16 +17933,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 48 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -17845,12 +17956,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -17858,27 +17970,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1EqYOzUyY9ygjHWYmTZonE5kaamHc6F8AcaxmECEx-PQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x128_MI323I4N4iBo8DW3RIRd0BHqSQeGBeqa7daq5LoZFfvEd3I= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -17901,48 +18013,48 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24832 + LdsBytesNoAmax: 135168 LdsInitCVgprs: false - LdsNumBytes: 24832 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 4224 + LdsNumBytes: 135168 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 101376 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 20608 + LdsOffsetA_Blk: 262144 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 295936 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 20608 - LdsPadA: 8 - LdsPadB: 8 + LdsOffsetMetadata: 135168 + LdsOffsetMetadata_Blk: 295936 + LdsPadA: 4 + LdsPadB: 4 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 128 MFMA_BF16_1K: false MIArchVgpr: 0 - MIBlock: [16, 16, 32, 1, 1, 1] + MIBlock: [32, 32, 16, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -17950,49 +18062,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] + MIWaveTile: [1, 3] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 192 + MacroTileA: 64 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 32 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 32, 1] + MatrixInstK: 16 + MatrixInstM: 32 + MatrixInstN: 32 + MatrixInstruction: [32, 32, 16, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 2 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 2 + NonTemporalD: 3 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 8 + NumLoadsB: 24 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 24 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -18000,8 +18112,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -18010,41 +18122,41 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 75 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA3_NTB2_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC16_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 8 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 + StoreSyncOpt: 1 StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: false + UseCustomMainLoopSchedule: 1 UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false @@ -18061,16 +18173,16 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingXCC: 16 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 0 _VectorStore: 1 @@ -18084,12 +18196,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18097,27 +18210,27 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16x8EUEagTvWzblDujb4a6-sHj4LC2-vadXjUhTArIYoxM= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1Fd3Wgq88OZ1PaojzfzjpDsCp8WKZiHDJo9GXAXrgRK0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -18140,34 +18253,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 + LdsBytesNoAmax: 59904 LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -18175,12 +18288,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18188,14 +18301,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] + MIWaveGroup: [4, 1] + MIWaveTile: [1, 2] MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -18210,36 +18323,38 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -18249,7 +18364,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 76 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA1_NTB0_NTC6_NTD6_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC32_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -18264,17 +18379,17 @@ StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] ThreadTile0: 4 - ThreadTile1: 1 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 1 + ThreadTileB: 2 TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18288,28 +18403,31 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 32 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 1 _VectorStore: 1 @@ -18323,12 +18441,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18336,20 +18455,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x64_MI16x1EDz1q2jKSyQTb112LxnuUuMbJQuylVJAloo1XWaXWOY= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x64x64_MI16x1oxplz8DgKAzzHI9atKMOCqO_fHvi4aY6eI_59fmy5x0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -18379,7 +18498,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -18389,24 +18508,24 @@ LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49664 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 49664 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 8448 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 41216 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 41216 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -18414,12 +18533,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18427,15 +18546,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 4] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18449,29 +18568,31 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 - NonTemporalD: 4 + NonTemporalC: 0 + NonTemporalD: 0 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 + NumElementsPerBatchStore: 8 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -18488,13 +18609,13 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 77 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA2_NTB0_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 0 + StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 @@ -18502,11 +18623,11 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 8 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] @@ -18527,6 +18648,9 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 @@ -18539,9 +18663,9 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -18562,12 +18686,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18575,20 +18700,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI16x1UXoAKHnq0-WZoXPcpEIvd8tVnS0VOqSLDnuTlvOr2KI= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x64_MI16HbQjL6Xiy4_bwYKGnfEzEBO73K7v6GCjIcyS997kha4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -18606,7 +18731,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -18618,7 +18743,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -18628,24 +18753,24 @@ LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 + LdsBytesNoAmax: 79872 LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 177152 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -18653,12 +18778,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18667,14 +18792,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18688,29 +18813,31 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 7 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 10 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -18727,21 +18854,21 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 78 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA2_NTB3_NTC7_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 1 + StaggerUStride: 0 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 2 + StoreSyncOpt: 4 + StoreVectorWidth: 1 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -18749,10 +18876,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -18766,21 +18893,24 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMapping: 2 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -18801,12 +18931,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -18814,7 +18945,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x128_MI16x7eJMtzuep7ti0IbbweWe307G-lT2FmDeeBlMvdoLAfg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16x0cYNusgHgF3Co_ShFYNVgf_9fASJQBYDTRah_BV_zEY= BufferLoad: true BufferStore: true CUCount: null @@ -18824,17 +18955,17 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -18845,7 +18976,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -18857,34 +18988,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB4_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 99328 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 99328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 51200 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 51200 + LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -18892,12 +19023,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 128 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -18906,14 +19037,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -18927,36 +19058,38 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 4 - NonTemporalC: 0 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerBatchStore: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -18966,7 +19099,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 79 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA2_NTB4_NTC0_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 @@ -18975,12 +19108,12 @@ StorePriorityOpt: 1 StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSyncOpt: 1 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 4 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -18988,10 +19121,10 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19005,28 +19138,31 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 8 + WorkGroupMapping: 32 + WorkGroupMappingXCC: 1 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer _UseSgprForGRO: 1 _VectorStore: 1 @@ -19040,12 +19176,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19053,20 +19190,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x64x32_MI16xByhPKgVjq4CDyWj-cbHC-i4QzDiSS4uWjZj1qyr1xvg= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x160x64_MI16MWaFtORgD4eC8ES5PuaeS70bQdnSFwME3aBH6fKhp6A= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 - DirectToLdsA: false - DirectToLdsB: false + DepthU: 64 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: true DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -19084,7 +19221,7 @@ GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -19096,34 +19233,34 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA2_NTB3_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 100352 + LdsBytesNoAmax: 152064 LdsInitCVgprs: false - LdsNumBytes: 100352 - LdsNumElementsAlignedA: 25600 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 152064 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 42240 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25600 - LdsOffsetB_Blk: 91136 + LdsOffsetA_Blk: 76032 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 109824 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 25600 - LdsOffsetMetadata_Blk: 91136 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 109824 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -19131,10 +19268,10 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: true + LocalWriteUseSgprB: true + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false MIArchVgpr: 0 MIBlock: [16, 16, 32, 1, 1, 1] @@ -19145,14 +19282,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [5, 2] - MIWaveTileA: 5 - MIWaveTileB: 2 + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 64 - MacroTileA: 160 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19166,29 +19303,31 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: false + NoLdsWriteCode: true NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 3 - NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 4 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 40 - NumGlobalWriteVectorsPerThread: 40 - NumLoadsA: 5 - NumLoadsB: 2 + NumElementsPerBatchStore: 12 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 NumThreads: 256 + NumTotalPackedLoadsA: 8 + NumTotalPackedLoadsB: 10 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -19196,7 +19335,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -19205,17 +19344,17 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 80 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_2_MO40_NTn1_NTA2_NTB3_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 0 StaggerUMapping: 0 StaggerUStride: 0 - StorePriorityOpt: 1 + StorePriorityOpt: 0 StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreSwapAddr: true + StoreSyncOpt: 4 + StoreVectorWidth: 4 StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 @@ -19227,16 +19366,16 @@ SuppressNoLoadLoop: false SwapGlobalReadOrder: false ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 2 - ThreadTileA: 20 - ThreadTileB: 2 - TransposeLDS: 1 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -19244,30 +19383,33 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: true + UseGeneralizedNLCOneB: true + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 2 + WorkGroupMappingXCC: 4 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: 0 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -19279,12 +19421,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: true - tailLoopOptB: true - - 1LDSBuffer: 0 + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 0 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19292,7 +19435,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xd-sQFx5RImj-B93gHPT59pdW-PfpHkRagGWfDrPaTnY= + BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xFyWqcs6KVFJkQ8kLhxkp503bjalLMrD-hQMHlzpWf2g= BufferLoad: true BufferStore: true CUCount: null @@ -19303,16 +19446,16 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: 0 + ExpandPointerSwap: true ExpertSchedulingMode: 0 ForceDisableShadowInit: false ForceUnrollSubIter: false @@ -19335,7 +19478,7 @@ SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 LDSTrInst: false LSCA: 64 LSCB: 64 @@ -19346,13 +19489,13 @@ LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 51200 LdsInitCVgprs: false - LdsNumBytes: 116224 + LdsNumBytes: 51200 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 @@ -19361,7 +19504,7 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 + LdsOffsetMetadata: 51200 LdsOffsetMetadata_Blk: 99328 LdsPadA: 8 LdsPadB: 8 @@ -19370,12 +19513,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: 1 MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -19405,16 +19548,16 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 7 - NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 4 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 @@ -19428,13 +19571,15 @@ NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -19444,12 +19589,12 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 81 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA1_NTB7_NTC0_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 0 StaggerUMapping: 0 - StaggerUStride: 512 + StaggerUStride: 0 StorePriorityOpt: 0 StoreRemapVectorWidth: 0 StoreSwapAddr: false @@ -19458,7 +19603,7 @@ StreamK: 3 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 0 + StreamKXCCMapping: 8 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 @@ -19483,6 +19628,9 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false UseSgprForGRO: 1 @@ -19496,8 +19644,8 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMapping: 32 + WorkGroupMappingXCC: 2 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 0] @@ -19510,7 +19658,7 @@ _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -19518,12 +19666,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19531,18 +19680,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1-95uwA7thjYZN8Q0bg1wWhpCED9_VDGCoINGlVsybm0= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3ZQt4emcWG8guHWikbv6OxFJy790l58gtvM3nfZjMIJE= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -19553,16 +19702,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -19571,50 +19720,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -19622,15 +19771,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -19650,21 +19799,21 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 7 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 @@ -19674,7 +19823,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -19683,32 +19832,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 82 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC7_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 16 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 2 - ThreadTileA: 8 - ThreadTileB: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -19724,28 +19874,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -19759,10 +19909,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -19770,7 +19921,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32NrolHpaoNMGubHMXg2QZmfmFTUHLarzXy0RPUp34mNo= BufferLoad: true BufferStore: true CUCount: null @@ -19780,8 +19930,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -19792,16 +19942,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -19810,75 +19960,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 100352 + LdsBytesNoAmax: 34304 LdsInitCVgprs: false - LdsNumBytes: 100352 - LdsNumElementsAlignedA: 17408 + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 2] - MIWaveTileA: 2 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -19889,16 +20039,16 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -19913,7 +20063,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -19922,31 +20072,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 83 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA1_NTB1_NTC5_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 16 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 16 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -19963,32 +20114,32 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -20000,8 +20151,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20009,18 +20161,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x32_MI16x1R4sVsBWCeh56t4_hhONAiKYW1myOGPmVq0nXhjyJNZo= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3e4Dw_hz57yPEZN_qoLaorfGepQNCz75gt6VQs5_mgZo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 256 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -20031,15 +20183,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -20049,50 +20201,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 14336 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 14336 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 5120 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 25600 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 14336 - LdsOffsetMetadata_Blk: 25600 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -20100,15 +20252,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [1, 1] MIWaveTile: [2, 1] MIWaveTileA: 2 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20128,22 +20280,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -20161,38 +20313,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 84 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA2_NTB0_NTC5_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 ThreadTile1: 1 ThreadTileA: 8 ThreadTileB: 1 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -20212,17 +20365,17 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -20230,17 +20383,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20248,7 +20402,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x64_MI16x1HgIojdE0sEyvw7DzcXO7orbn8GPiaT6KOVUT7VcaD98= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6nB1C3iOYxU_7DUOjuOyODiZ9rVSorIjLm4u2U10uDaw= BufferLoad: true BufferStore: true CUCount: null @@ -20258,8 +20412,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -20270,16 +20424,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -20288,50 +20442,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB2_NTC1_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 65024 + LdsBytesNoAmax: 50176 LdsInitCVgprs: false - LdsNumBytes: 65024 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: 0 + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -20339,15 +20493,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 48 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 48 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20368,21 +20522,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -20391,7 +20545,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -20400,32 +20554,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 85 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB2_NTC1_NTD5_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20444,24 +20599,24 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -20476,10 +20631,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20487,20 +20643,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xRMVFeWautsW6BnxBD-9Gp9QjcP7DmKIwBt0NKkW_I0M= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -20509,15 +20664,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -20527,50 +20682,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB0_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -20578,14 +20733,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -20600,28 +20755,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 + NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 1 - NonTemporalD: 3 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -20630,7 +20785,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -20639,32 +20794,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 86 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA3_NTB0_NTC1_NTD3_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 2 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -20680,27 +20836,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -20708,17 +20864,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20726,18 +20883,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI16K1w2va1wKhvgZoTyL5zX8YLee7JqsF5V6wtSMY1I-GE= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3SwF0bvQxB0RrhRxtPMVt3TyizF16j4vW99jq_X9KpHk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -20748,16 +20905,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -20766,37 +20923,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB6_NTC4_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -20806,10 +20963,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -20817,15 +20974,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -20845,23 +21002,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -20869,7 +21026,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -20878,38 +21035,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 87 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB6_NTC4_NTD3_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -20919,45 +21077,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -20965,7 +21124,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI32x3HCPsg4TodlR6Q8cd0m5gwDltLOtlMg4yX28q5bkM3nc= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3YeRtOHUsp9ttyStfNKrKENY_vaUUjqgLCrwM91_2-aY= BufferLoad: true BufferStore: true CUCount: null @@ -20975,10 +21134,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -20987,15 +21146,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -21005,102 +21164,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 49408 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 49408 - LdsNumElementsAlignedA: 8320 - LdsNumElementsAlignedB: 8320 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 32768 - LdsOffsetB: 8320 - LdsOffsetB_Blk: 41088 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8320 - LdsOffsetMetadata_Blk: 41088 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [2, 1] MIWaveTile: [1, 1] MIWaveTileA: 1 MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 3 - NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -21117,38 +21276,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 88 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA1_NTB3_NTC5_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 4 ThreadTile1: 1 - ThreadTileA: 16 + ThreadTileA: 4 ThreadTileB: 1 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -21158,7 +21318,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -21168,17 +21328,17 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [32, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -21186,17 +21346,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21204,18 +21365,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x128x32_MI16ZwWemFSm3p4igCb8DLsnt6R0foWhVdW4rDKTnbvNCrM= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3cSFHPElhrbZabUpjPK0idMvlUk8E6jLU5EZAOZd0T7g= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -21226,16 +21387,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -21244,37 +21405,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 117760 + LdsBytesNoAmax: 30208 LdsInitCVgprs: false - LdsNumBytes: 117760 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 100352 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -21284,10 +21445,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -21295,15 +21456,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [4, 8] - MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 128 - MacroTileA: 256 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21324,22 +21485,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 2 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -21347,7 +21508,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -21356,38 +21517,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 89 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x128x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA0_NTB1_NTC5_NTD2_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 16 + StreamKXCCMapping: 0 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 8 - ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -21397,28 +21559,28 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -21427,15 +21589,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 2 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21443,18 +21606,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT192x96x32_MI16xj2r8CbY8oh8svK2xXcWJE-9Rp3qvl1HSgXtzAYdPfBI= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1I2reK521Z0OJ8WcGapE1-BfPXpcGPiwVna1dA3-Ll3c= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 256 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -21465,16 +21628,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -21483,50 +21646,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x96x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA3_NTB1_NTC5_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 27648 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 27648 - LdsOffsetB_Blk: 93184 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 93184 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -21534,15 +21697,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [6, 3] - MIWaveTileA: 6 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 192 - MacroTile1: 96 - MacroTileA: 192 - MacroTileB: 96 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21562,22 +21725,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 72 - NumGlobalWriteVectorsPerThread: 36 - NumLoadsA: 6 - NumLoadsB: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 6 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21586,7 +21749,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -21595,36 +21758,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 90 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x96x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA3_NTB1_NTC5_NTD5_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 24 - ThreadTile1: 3 - ThreadTileA: 24 - ThreadTileB: 3 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -21636,45 +21800,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21682,7 +21847,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT256x256x32_MI1616MP9_S2iGj4HSyT5jYf0jksUCFwInC_pUDeGGq21_Y= BufferLoad: true BufferStore: true CUCount: null @@ -21693,7 +21857,7 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -21704,15 +21868,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -21722,37 +21886,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 139264 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 139264 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 69632 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 104448 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 104448 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -21765,7 +21929,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -21774,14 +21938,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [8, 8] - MIWaveTileA: 8 - MIWaveTileB: 8 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 256 - MacroTile1: 256 - MacroTileA: 256 - MacroTileB: 256 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -21802,21 +21966,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 2 - NonTemporalC: 3 - NonTemporalD: 3 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 256 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 32 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -21834,18 +21998,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 91 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB2_NTC3_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -21855,17 +22019,18 @@ SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 8 - ThreadTileA: 32 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -21875,7 +22040,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 @@ -21886,34 +22051,35 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 2 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -21921,20 +22087,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x128x64_MI16xro-D8veBeNmE7sMrxwoD_VnMFoW5lt0JJ6_H2vXgEsE= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -21943,15 +22108,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -21961,37 +22126,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB1_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 116224 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 116224 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16896 - LdsOffsetMetadata_Blk: 82432 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -21999,12 +22164,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22012,14 +22177,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -22034,28 +22199,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22064,7 +22229,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -22073,36 +22238,37 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 92 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA2_NTB1_NTC1_NTD5_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 2 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -22114,45 +22280,46 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22160,7 +22327,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI1632V6Pxwe_725MnjTV9-vmbwgaM0uPWz-Wc0cWfNZEtw= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ULSHV5TvKv-3s7N5kPB29fM38e7xZ96gnPlfW9sr_dw= BufferLoad: true BufferStore: true CUCount: null @@ -22171,7 +22338,7 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -22182,16 +22349,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -22200,37 +22367,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC7_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 0 LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 8 + LSPB: 8 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 13312 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 13312 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2560 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 10752 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -22243,7 +22410,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22251,15 +22418,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 4] - MIWaveTileA: 4 - MIWaveTileB: 4 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22281,21 +22448,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 7 - NonTemporalD: 6 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -22312,38 +22479,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 93 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC7_NTD6_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -22353,27 +22521,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 4 + VectorWidthA: 1 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -22381,8 +22549,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -22390,8 +22558,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22399,7 +22568,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x32_MI16q_1WHADry2cTxAoNmb44qR0nw6Q5GfHEK9JlilAlfio= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Gtebt9rQIpwe9oFGHoWcPG_t3BC0dLSpyLHC5C0Kq0M= BufferLoad: true BufferStore: true CUCount: null @@ -22410,7 +22579,7 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -22421,16 +22590,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -22439,11 +22608,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -22452,24 +22621,24 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 43008 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 43008 - LdsNumElementsAlignedA: 25600 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 25600 - LdsOffsetB_Blk: 91136 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 43008 - LdsOffsetMetadata_Blk: 91136 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -22482,7 +22651,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22491,13 +22660,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [5, 4] - MIWaveTileA: 5 + MIWaveTile: [4, 4] + MIWaveTileA: 4 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 160 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 160 + MacroTileA: 128 MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -22519,20 +22688,20 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 2 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 3 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 5 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 @@ -22551,18 +22720,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 94 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB2_NTC0_NTD3_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC4_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -22572,15 +22741,16 @@ SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 + ThreadTile0: 16 ThreadTile1: 4 - ThreadTileA: 20 + ThreadTileA: 16 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -22592,10 +22762,10 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -22603,17 +22773,17 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 4 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -22629,8 +22799,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22638,7 +22809,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32EK4Ew7js7-4eeoqDkT9c309xo8_wSmrv-mO16L7kjzE= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6V3GWrV8q1KT6qcNF3NRQe6SaKzZRXLdh2CbWinGWLAU= BufferLoad: true BufferStore: true CUCount: null @@ -22648,8 +22819,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -22658,17 +22829,17 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -22678,75 +22849,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB1_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 34816 + LdsNumBytes: 26112 LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedB: 8704 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 + LdsOffsetA_Blk: 32768 LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] + MIWaveGroup: [2, 1] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -22757,22 +22928,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 1 - NonTemporalC: 6 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -22780,8 +22951,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -22790,31 +22961,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 95 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA3_NTB1_NTC6_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -22831,7 +23003,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -22841,17 +23013,17 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -22859,8 +23031,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -22868,8 +23040,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -22877,18 +23050,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1Tt22kH2e9CMY6k_Y8O_l57cXp8oRQ2KAhNqC8vG9-is= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6zsxRwkMqBm0-RotUdPfZ2aFGnmO-YpXpopyCt-rV2s4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -22899,16 +23072,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -22917,50 +23090,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB4_NTC0_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 9728 + LdsBytesNoAmax: 67072 LdsInitCVgprs: false - LdsNumBytes: 9728 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 67072 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9728 - LdsOffsetMetadata_Blk: 21504 + LdsOffsetMetadata: 67072 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -22969,14 +23142,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -22996,23 +23169,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 4 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 5 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -23020,7 +23193,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23029,18 +23202,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 96 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA5_NTB4_NTC0_NTD5_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -23050,11 +23223,12 @@ SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23070,27 +23244,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [32, 4, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -23107,8 +23281,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23116,18 +23291,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x32x32_MI16x1tPRLVkTDR1Eck8xeuIldFLH6hKJV_G81Iz6F9AruisE= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6HCXW7Md-kYsZ-1UnmrrC7uG9YBlO7z5sEs9KMCRnZRY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -23136,17 +23311,17 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -23156,37 +23331,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC7_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 9216 + LdsBytesNoAmax: 26624 LdsInitCVgprs: false - LdsNumBytes: 9216 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 4608 + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 20992 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 9216 - LdsOffsetMetadata_Blk: 20992 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -23196,10 +23371,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23207,14 +23382,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 2] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] MIWaveTileA: 2 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -23235,30 +23410,30 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 6 - NonTemporalB: 2 - NonTemporalC: 7 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 10 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 64 + NumLoadsPerpendicularB: 2 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -23268,38 +23443,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 97 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA6_NTB2_NTC7_NTD7_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 2 + ThreadTile1: 1 ThreadTileA: 8 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -23313,23 +23489,23 @@ Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -23337,8 +23513,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -23346,8 +23522,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23355,18 +23532,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x32_MI16x1b-6QnEjVmiUVwlO7nIU1i3ohIg0WxzZTpmj7_cnGZsM= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT68SIh_bnF1917_7343uVm45oe_0GQqSHLfLK0W_wi_u8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -23375,18 +23552,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -23395,37 +23572,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB7_NTC6_NTD6_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 18944 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 18944 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 10240 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18944 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -23435,10 +23612,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23446,10 +23623,10 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] - MIWaveTileA: 4 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 MacroTile1: 64 @@ -23475,21 +23652,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 6 - NonTemporalD: 6 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 + NumElementsPerBatchStore: 8 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23497,8 +23674,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23507,32 +23684,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 98 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB7_NTC6_NTD6_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC16_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23551,24 +23729,24 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 16 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -23576,8 +23754,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -23585,8 +23763,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23594,7 +23773,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x32_MI16x1P_TUSJ5Myg8d1FJPwt0Wuh6SkhZpVj2Mg6_2M6eLfpQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6oabzLLQwOO03l3MEmNqlhJ6dLm9NnzDdPu-7gnuHCJQ= BufferLoad: true BufferStore: true CUCount: null @@ -23604,8 +23783,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -23614,18 +23793,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -23634,42 +23813,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA5_NTB2_NTC5_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 24576 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 24576 - LdsNumElementsAlignedA: 9216 - LdsNumElementsAlignedB: 15360 + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 9216 - LdsOffsetB_Blk: 41984 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 24576 - LdsOffsetMetadata_Blk: 41984 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -23677,7 +23856,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23685,15 +23864,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23713,22 +23892,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 5 - NonTemporalB: 2 - NonTemporalC: 5 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -23736,7 +23915,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -23746,32 +23925,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 99 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA5_NTB2_NTC5_NTD7_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -23787,27 +23967,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -23815,17 +23995,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -23833,7 +24014,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI16xouCaO_kS3h9va9qBGG2ChOaMcrsz9moeqbtRNLm2voU= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6R02QOCgIPWslloJiox9BjhuIrZ3CRli_uOaWUztN7xk= BufferLoad: true BufferStore: true CUCount: null @@ -23843,10 +24024,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -23855,15 +24036,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: true + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -23873,50 +24054,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB5_NTC3_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 LSPA: 8 - LSPB: 32 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 152576 + LdsBytesNoAmax: 99840 LdsInitCVgprs: false - LdsNumBytes: 152576 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 67584 + LdsNumBytes: 99840 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 66560 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 76288 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 84992 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 84992 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 164352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 1 - LoopUnroll: 32 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -23924,15 +24105,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 8] + MIWaveGroup: [1, 2] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 512 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 512 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -23946,22 +24127,22 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 5 - NonTemporalC: 3 - NonTemporalD: 6 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 8 NumLoadsB: 16 NumLoadsCoalescedA: 1 @@ -23976,7 +24157,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -23985,38 +24166,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 100 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_8_MO40_NTn1_NTA1_NTB5_NTC3_NTD6_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 32 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 8 - TransposeLDS: 2 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -24026,7 +24208,7 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 @@ -24036,35 +24218,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false - numSubTiles: 2 + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24072,7 +24255,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x48x32_MI16x77c5vCqhF37wngAzGqfnFLKmA0iPzLeWaJiRYnuXPkU= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1AxfD8GQK1funv0jxGwFu2nRz6aluGc9YL9jcdz-7fEk= BufferLoad: true BufferStore: true CUCount: null @@ -24082,8 +24265,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -24094,16 +24277,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -24112,42 +24295,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x48x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 33280 + LdsBytesNoAmax: 131072 LdsInitCVgprs: false - LdsNumBytes: 33280 - LdsNumElementsAlignedA: 25600 - LdsNumElementsAlignedB: 7680 + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 25600 - LdsOffsetB_Blk: 91136 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 91136 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 197632 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -24155,7 +24338,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -24163,15 +24346,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 1] - MIWaveTile: [5, 3] - MIWaveTileA: 5 - MIWaveTileB: 3 + MIWaveGroup: [1, 1] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 48 - MacroTileA: 160 - MacroTileB: 48 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24192,22 +24375,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 60 - NumGlobalWriteVectorsPerThread: 60 - NumLoadsA: 10 - NumLoadsB: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 3 - NumThreads: 128 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -24224,32 +24407,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 101 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x48x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT5_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 3 - ThreadTileA: 20 - ThreadTileB: 3 + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -24265,32 +24449,32 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 4, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -24302,8 +24486,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24311,7 +24496,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32Q9GNPZBj4w2q8J-ZJZlKjlie8yP-WUZUJeJmisgU0-g= BufferLoad: true BufferStore: true CUCount: null @@ -24321,8 +24505,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -24333,15 +24517,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -24351,75 +24535,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA1_NTB1_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35328 + LdsBytesNoAmax: 99840 LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 16896 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 99840 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16896 - LdsOffsetB_Blk: 82432 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 82432 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 197632 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [4, 1] + MIWaveGroup: [2, 1] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -24430,22 +24614,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 1 - NonTemporalB: 1 - NonTemporalC: 5 - NonTemporalD: 7 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24463,38 +24647,39 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 102 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA1_NTB1_NTC5_NTD7_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 + StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 64 - ThreadTile1: 1 - ThreadTileA: 64 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -24504,27 +24689,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -24532,8 +24717,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -24541,8 +24726,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24550,7 +24736,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x448x32_MI16kicIlK4eREQNKFD_4qwLZk8EPghmo4TEg7fAcQxRmDY= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wZn1lj5-xh86AxV8XC_TfIrJlVoj_3cWBkWnmMVsin8= BufferLoad: true BufferStore: true CUCount: null @@ -24561,7 +24747,7 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -24570,17 +24756,17 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -24590,11 +24776,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB2_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 LSCA: 32 LSCB: 32 LSPA: 32 @@ -24604,13 +24790,13 @@ LVPA: 8 LVPB: 8 LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 89088 + LdsBytesNoAmax: 72704 LdsInitCVgprs: false - LdsNumBytes: 89088 + LdsNumBytes: 72704 LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 71680 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 131072 @@ -24619,7 +24805,7 @@ LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 89088 + LdsOffsetMetadata: 72704 LdsOffsetMetadata_Blk: 148480 LdsPadA: 8 LdsPadB: 8 @@ -24633,7 +24819,7 @@ LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -24642,14 +24828,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [8, 7] + MIWaveTile: [8, 6] MIWaveTileA: 8 - MIWaveTileB: 7 + MIWaveTileB: 6 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 448 + MacroTile1: 384 MacroTileA: 128 - MacroTileB: 448 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -24670,21 +24856,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 2 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 2 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 224 - NumGlobalWriteVectorsPerThread: 56 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 NumLoadsA: 4 - NumLoadsB: 14 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 14 + NumLoadsPerpendicularB: 12 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24692,7 +24878,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: @@ -24702,18 +24888,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 103 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x448x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_7_MO40_NTn1_NTA0_NTB2_NTC0_NTD2_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 + StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -24723,17 +24909,18 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 32 - ThreadTile1: 7 + ThreadTile1: 6 ThreadTileA: 32 - ThreadTileB: 7 - TransposeLDS: 2 + ThreadTileB: 6 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -24747,7 +24934,7 @@ Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -24758,12 +24945,12 @@ WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -24771,8 +24958,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -24780,8 +24967,9 @@ tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -24789,18 +24977,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x128x32_MI32WmWOeC1HZXF52s2qw5enQVIcphzfzkpjTKp5gB44mAw= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT65gE-MCZm4_zHo6C5uZDRUoKD8EXmAPJ3N4uk_Y8yfys= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -24811,16 +24999,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -24829,11 +25017,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_4_MO40_NTn1_NTA4_NTB1_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -24842,26 +25030,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 35328 + LdsBytesNoAmax: 18432 LdsInitCVgprs: false - LdsNumBytes: 35328 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 83968 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 35328 - LdsOffsetMetadata_Blk: 83968 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -24869,35 +25057,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 4] - MIWaveTileA: 1 - MIWaveTileB: 4 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -24908,22 +25096,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -24932,7 +25120,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -24941,18 +25129,18 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 104 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_4_MO40_NTn1_NTA4_NTB1_NTC1_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM48_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -24962,15 +25150,16 @@ SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 4 - ThreadTileA: 16 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -24982,27 +25171,27 @@ UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 48 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -25017,10 +25206,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25028,18 +25218,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x256x32_MI32xLh-ycjs1oSwZbXiz2hnStK9M-0nCgCFZS7lNjPV_78M= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6I8O4paswkmb66D5oBp-XVQKwsaElMfkk9UWM0xAgsVs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -25050,14 +25240,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -25067,11 +25258,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 LSPA: 32 @@ -25083,23 +25274,23 @@ LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 109056 + LdsBytesNoAmax: 18432 LdsInitCVgprs: false - LdsNumBytes: 109056 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 34816 + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 74240 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -25107,35 +25298,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] + MIWaveGroup: [2, 2] MIWaveTile: [2, 2] MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 256 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 256 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -25147,21 +25338,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 7 - NonTemporalC: 3 - NonTemporalD: 2 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25170,7 +25361,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -25179,31 +25370,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 105 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_2_MO40_NTn1_NTA0_NTB7_NTC3_NTD2_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 0 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 32 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -25214,12 +25406,13 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -25230,17 +25423,17 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -25249,14 +25442,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25264,7 +25459,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI32xB_DE9RVLGbG_PwgSFrYUPdM2k3sTJC_ftD4lxU9bY6g= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6eRXgyMefjCuWXVIY2LbceXYoQhTimXSfxsYDWpNFOp0= BufferLoad: true BufferStore: true CUCount: null @@ -25274,10 +25469,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -25286,15 +25481,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -25303,51 +25499,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 150528 + LdsBytesNoAmax: 51712 LdsInitCVgprs: false - LdsNumBytes: 150528 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 66560 + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 75264 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 83968 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 83968 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -25355,49 +25551,49 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [1, 4] - MIWaveTile: [2, 4] - MIWaveTileA: 2 - MIWaveTileB: 4 + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 512 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 512 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 6 - NonTemporalC: 4 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 2 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 8 - NumLoadsB: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25415,32 +25611,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 106 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA2_NTB6_NTC4_NTD2_NTM0_NEPBS2_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 4 - ThreadTileA: 32 - ThreadTileB: 4 + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -25450,49 +25647,52 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 4 + VectorWidthA: 4 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25500,7 +25700,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x512x32_MI32xNPMvn-evBNCDHAzhSPgPljENZ_iAGw5fkD0ulDNYMWc= BufferLoad: true BufferStore: true CUCount: null @@ -25510,8 +25709,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -25522,15 +25721,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -25539,75 +25739,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB6_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 76288 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 76288 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 67584 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 139776 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 76288 - LdsOffsetMetadata_Blk: 139776 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 4] - MIWaveTileA: 2 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 512 - MacroTileA: 64 - MacroTileB: 512 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -25618,22 +25818,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 5 - NonTemporalD: 3 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 128 - NumGlobalWriteVectorsPerThread: 64 - NumLoadsA: 2 - NumLoadsB: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25642,7 +25842,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -25651,31 +25851,32 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 107 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x512x32_MI32x32x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA3_NTB6_NTC5_NTD3_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 256 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 + ThreadTile0: 16 ThreadTile1: 4 - ThreadTileA: 32 + ThreadTileA: 16 ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true @@ -25686,49 +25887,52 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 24 - WorkGroupMappingXCC: 2 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25736,7 +25940,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x384x32_MI32xJv85LAZjXvrheXhR6Pb2-1BpR8Zhns4NnJ21cIAl2Hw= BufferLoad: true BufferStore: true CUCount: null @@ -25746,8 +25949,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -25758,15 +25961,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -25775,39 +25979,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x384x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 129536 + LdsBytesNoAmax: 67584 LdsInitCVgprs: false - LdsNumBytes: 129536 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 55296 + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 74240 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8704 - LdsOffsetMetadata_Blk: 74240 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -25816,34 +26020,34 @@ LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopIters: 2 - LoopUnroll: 32 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 384 - MacroTileA: 64 - MacroTileB: 384 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -25855,21 +26059,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 96 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 2 - NumLoadsB: 12 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -25887,84 +26091,88 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 108 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x384x32_MI32x32x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC1_NTD1_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW2_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM48_WGMXCC32_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 128 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 2 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 32 - ThreadTile1: 3 - ThreadTileA: 32 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 32 + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -25972,7 +26180,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x256_MI16xW4r92vEXVihDdHo-lHvJ0uNEpIQK9BpFRUZ08nMbzCs= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT189iqYKY55VEcPzDOXv9ylhp1p78P2DCBr2S6_6nafkw= BufferLoad: true BufferStore: true CUCount: null @@ -25982,10 +26190,10 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -25994,15 +26202,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -26011,50 +26220,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 2048 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115712 + LdsBytesNoAmax: 79872 LdsInitCVgprs: false - LdsNumBytes: 115712 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 177152 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26062,15 +26271,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26084,28 +26293,28 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 7 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 5 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 5 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 10 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -26123,32 +26332,33 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 109 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA7_NTB1_NTC1_NTD5_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26158,7 +26368,8 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 @@ -26166,24 +26377,24 @@ UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -26191,16 +26402,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26208,18 +26421,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x16x128_MI16xwGhVm6bClKczrRVvyM1JDrgb5E0uKGXdH1TpIh1TL0o= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -26230,14 +26442,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -26247,42 +26460,42 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 17408 + LdsBytesNoAmax: 90624 LdsInitCVgprs: false - LdsNumBytes: 17408 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 90624 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 67584 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 41472 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 41472 + LdsOffsetMetadata: 90624 + LdsOffsetMetadata_Blk: 154112 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 2 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false @@ -26290,7 +26503,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26298,15 +26511,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26326,23 +26539,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 - NonTemporalB: 1 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 5 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 16 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26359,67 +26572,69 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 110 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x128_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA4_NTB1_NTC1_NTD1_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_2_WGM0_WGMXCC8_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 16 + SubGroup1: 64 SubGroupA: 4 - SubGroupB: 16 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 2] - WorkGroupMapping: 0 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -26429,14 +26644,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false - ActivationFuncCall: true + ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26444,19 +26661,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT80x16x32_MI16x1TWdBy3MnQQKMOSifSRrwSeieAjAz_Ipy8XM9t_apH8I= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: true - DirectToLdsA: true + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false @@ -26466,15 +26682,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 2 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -26483,37 +26700,3650 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 4 + LSPA: 32 + LSPB: 32 LVCA: 8 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 29504 + LdsBytesNoAmax: 87040 LdsInitCVgprs: false - LdsNumBytes: 29504 - LdsNumElementsAlignedA: 10560 - LdsNumElementsAlignedB: 2560 + LdsNumBytes: 87040 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 69632 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10560 - LdsOffsetB_Blk: 26944 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 87040 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 512 + MacroTileA: 128 + MacroTileB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128000 + LdsInitCVgprs: false + LdsNumBytes: 128000 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 2 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1U7vOvdYFB8EejoI1DeZ2XXlfHVNEE5d0H12vbNNpATs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3OvWxntshUWW3_A-VI6XNSghru3U4UDSTDhjcjVR3HKY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23040 + LdsInitCVgprs: false + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT46N5m9gwCFO-Nzc9yENVU3SI1QrpCiOXEjeEPFSn86Hk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 40448 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 40448 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vaiUWgr6lZ68qXC_TGMU65523uOYKq3Ec6eraxi_h38= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23040 + LdsInitCVgprs: false + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1pFiexEy_nQA9jS434_CfI7aGjSUEPvYPE95_vzPQKTw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 7680 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 48 + MacroTileA: 128 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1H4SRQMHnBm8MPmn7CC4vovQuF9Klt0xNrGusZalzPig= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1rXTuKdoyxclHidecoClA0AfcqpENiqcNipr2eRQNhwg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 79872 + LdsInitCVgprs: false + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9zD0PX1FPhAiuZDs_KjdPAn76aiPUEhRhVS2tkop5n3A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44544 + LdsInitCVgprs: false + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 30720 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 79360 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 79360 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 192 + MacroTileA: 96 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 3 + ThreadTileA: 24 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT19UJ9YnHb6cR5JwasXZkP1sk4AUOzy6Nd_GCne8pVOR0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wWhUNhsLXnwdUgsCNHLafWrE-J4dg946fN2_q4HV5w8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57856 + LdsInitCVgprs: false + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 57856 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 81920 + LdsInitCVgprs: false + LdsNumBytes: 81920 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 56832 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 10 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1KkEUdHGIe6dKJ0zzMlcYFBbwtI1FwPuq4-b2hkE_RVc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT65_iMlfIkk8B93jR0j9fItJcWD8qLSyBShDieS7L1wt0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 31232 + LdsInitCVgprs: false + LdsNumBytes: 31232 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 13824 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31232 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9gGJDLTTlViLZGJavc0sPMnxgvCGjSJIqu4gt9wnKCkU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 + LdsInitCVgprs: false + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10560 - LdsOffsetMetadata_Blk: 26944 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -26521,12 +30351,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 1 - LoopUnroll: 32 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26534,15 +30364,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [5, 1] - MIWaveTileA: 5 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 80 - MacroTile1: 16 - MacroTileA: 80 - MacroTileB: 16 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26562,23 +30392,23 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 4 + NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 8 - NumElementsPerThread: 20 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 10 - NumLoadsB: 4 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 4 - NumThreads: 64 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -26586,7 +30416,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 + PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -26594,33 +30424,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 111 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT80x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB2_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_1_MO40_NTn1_NTA4_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS0_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM7_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM0_WGMXCC32_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 126 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 7 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 1 - ThreadTileA: 20 - ThreadTileB: 1 + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -26630,12 +30461,13 @@ UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false - UseDot2F32XEmulation: true + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 @@ -26645,34 +30477,36 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 32 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false enableLDSTrB: false + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false + tailLoopOptA: true tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26680,7 +30514,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x7fd2kUwjm75D0kEujcLkV3_GTKT5wWnTcae2Fu7jWuw= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT45ulS7WcdI7UNW-ipBhI5_9g0NWEweK1v4iw5AdMdwyw= BufferLoad: true BufferStore: true CUCount: null @@ -26690,8 +30524,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -26702,15 +30536,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -26719,37 +30554,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 86016 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 86016 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 52224 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 40448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 86016 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 40448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -26759,10 +30594,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -26770,15 +30605,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -26798,22 +30633,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 - NonTemporalC: 1 - NonTemporalD: 2 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 14 + NumElementsPerBatchStore: 6 NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 12 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -26822,7 +30657,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -26830,85 +30665,89 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 112 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC1_NTD2_NTM0_NEPBS14_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 127 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 - UseDot2F32XEmulation: true + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -26916,7 +30755,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x80x128_MI16xEh4DCgEFCt-afK9cUIQCk_5N_QizF04nbCdMOaWqSgw= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT45Pf77PZ0y2a63oqBv5xuSEpvYfkv5rrfz2OggkgqUbU= BufferLoad: true BufferStore: true CUCount: null @@ -26926,8 +30765,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -26938,15 +30777,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -26956,37 +30795,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 156672 + LdsBytesNoAmax: 26112 LdsInitCVgprs: false - LdsNumBytes: 156672 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 43520 + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 78336 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 113152 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 40448 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 113152 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 40448 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -26996,10 +30835,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27007,15 +30846,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 5] - MIWaveTileA: 1 - MIWaveTileB: 5 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 80 - MacroTileA: 64 - MacroTileB: 80 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27036,21 +30875,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 2 - NonTemporalD: 2 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 20 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 10 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27059,7 +30898,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -27067,33 +30906,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 113 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x80x128_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_5_MO40_NTn1_NTA0_NTB6_NTC2_NTD2_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 128 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true + StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 5 - ThreadTileA: 4 - ThreadTileB: 5 + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -27102,35 +30942,35 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -27148,6 +30988,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27155,7 +30996,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16xIjYsJ7nPzSNt9gcCFNyYcDjxOwbmoQKbvftRgAY1G4I= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6-PpbdGeEL52tGWSKuV1d3aERKjhvbI9dAbn_gKJooqY= BufferLoad: true BufferStore: true CUCount: null @@ -27165,8 +31006,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -27177,16 +31018,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -27195,37 +31036,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 86016 + LdsBytesNoAmax: 59904 LdsInitCVgprs: false - LdsNumBytes: 86016 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 52224 + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 51200 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 86016 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 59904 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -27235,10 +31076,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27246,15 +31087,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 320 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 320 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27274,22 +31115,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 2 - NonTemporalB: 5 - NonTemporalC: 1 - NonTemporalD: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 12 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 10 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27298,7 +31139,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -27306,78 +31147,79 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 114 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA2_NTB5_NTC1_NTD1_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM6_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 129 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 6 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -27387,6 +31229,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27394,7 +31237,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16xG8s9vcohdFWaEPW_OKAxb_cYdQ6B4AQZ2nx9kjNOYto= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zKzQos5jBY-oZsuABvfgC-YVr0XQkeZ9FoaVzwSEWkA= BufferLoad: true BufferStore: true CUCount: null @@ -27404,8 +31247,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -27416,16 +31259,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthA: 2 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -27434,37 +31277,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT112x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS7_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 86016 + LdsBytesNoAmax: 48640 LdsInitCVgprs: false - LdsNumBytes: 86016 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 52224 + LdsNumBytes: 48640 + LdsNumElementsAlignedA: 17920 + LdsNumElementsAlignedB: 30720 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 83456 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 86016 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 48640 + LdsOffsetMetadata_Blk: 83456 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -27474,10 +31317,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27485,15 +31328,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [7, 3] + MIWaveTileA: 7 MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 112 + MacroTile1: 192 + MacroTileA: 112 + MacroTileB: 192 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27513,22 +31356,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 6 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 1 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 12 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 84 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 7 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 6 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27537,7 +31380,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -27545,32 +31388,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 115 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA3_NTB6_NTC0_NTD1_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 130 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT112x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS7_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 + ThreadTile0: 28 ThreadTile1: 3 - ThreadTileA: 8 + ThreadTileA: 28 ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true @@ -27580,35 +31424,35 @@ UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 + VectorWidthA: 1 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -27626,6 +31470,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27633,7 +31478,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x34z5jA9IIcOhaYTxrREvShaRXUEMAbqXkdSTqaoGUr0= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3v1NwFnVXJTJCep_voPt7RyBBkZRSCMUMhO_N-SZ5gL8= BufferLoad: true BufferStore: true CUCount: null @@ -27643,8 +31488,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -27655,15 +31500,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -27673,37 +31518,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 86016 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 86016 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 52224 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 86016 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -27713,10 +31558,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27724,15 +31569,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27753,21 +31598,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 6 - NonTemporalC: 2 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -27776,7 +31621,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -27784,70 +31629,71 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 116 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB6_NTC2_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM6_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 131 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 4 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 3 + ThreadTile1: 2 ThreadTileA: 8 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 6 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -27865,6 +31711,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -27872,7 +31719,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x96x128_MI16x9MB_BItjxFkx0YR1b98tLWirgDOvhTRoD-0flsTcQhA= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6C54sGtPjCJ3V-cPRAE3Ns8NHt0h_voSZTrp24ROARTw= BufferLoad: true BufferStore: true CUCount: null @@ -27882,8 +31729,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -27894,16 +31741,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -27912,37 +31759,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 86016 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 86016 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 52224 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 164864 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 86016 - LdsOffsetMetadata_Blk: 164864 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -27952,10 +31799,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -27963,15 +31810,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 3] - MIWaveTileA: 2 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 256 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -27992,21 +31839,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 1 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 12 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 12 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28015,7 +31862,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -28023,78 +31870,79 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 117 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x128_MI16x16x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_3_MO40_NTn1_NTA0_NTB4_NTC1_NTD0_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS0_SU8_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM0_WGMXCC2_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 132 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 3 - ThreadTileA: 8 - ThreadTileB: 3 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 0 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -28104,6 +31952,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28111,7 +31960,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x128_MI32o1cp8AHeO1wh-BhDUt3-ptcU4Mtb1M0g3Gvr_oirH9Q= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6uAdk8COtu0uSDFSBZJBI7WxTlIvedN3IPIedLDwGas0= BufferLoad: true BufferStore: true CUCount: null @@ -28121,8 +31970,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -28131,18 +31980,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -28151,39 +32000,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 135168 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 101376 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 262144 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 295936 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 135168 - LdsOffsetMetadata_Blk: 295936 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -28191,35 +32040,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 192 + MacroTile1: 256 MacroTileA: 64 - MacroTileB: 192 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -28230,22 +32079,22 @@ NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 - NonTemporalA: 3 - NonTemporalB: 5 + NonTemporalA: 0 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 - NumLoadsA: 8 - NumLoadsB: 24 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 24 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28253,8 +32102,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -28262,19 +32111,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 118 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA3_NTB5_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS0_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM48_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 + SolutionIndex: 133 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -28284,48 +32133,49 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 3 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 3 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 48 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -28343,6 +32193,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28350,18 +32201,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x192x128_MI323I4N4iBo8DW3RIRd0BHqSQeGBeqa7daq5LoZFfvEd3I= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 128 - DirectToLds: 0 + DepthU: 64 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -28370,18 +32220,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -28390,39 +32240,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 135168 + LdsBytesNoAmax: 67584 LdsInitCVgprs: false - LdsNumBytes: 135168 + LdsNumBytes: 67584 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 101376 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 262144 + LdsOffsetA_Blk: 131072 LdsOffsetB: 33792 - LdsOffsetB_Blk: 295936 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 135168 - LdsOffsetMetadata_Blk: 295936 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -28430,11 +32280,11 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 8 - LoopUnroll: 128 + LoopIters: 2 + LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -28442,23 +32292,23 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 192 - MacroTileA: 64 - MacroTileB: 192 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -28470,21 +32320,21 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 2 - NonTemporalD: 3 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 48 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 - NumLoadsB: 24 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 24 + NumLoadsPerpendicularB: 8 NumThreads: 256 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -28492,7 +32342,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -28501,70 +32351,71 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 119 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x192x128_MI32x32x1_CMS_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB4_NTC2_NTD3_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC16_WGMXCCGn1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 512 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 3 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 3 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 - UseCustomMainLoopSchedule: 1 + UseCustomMainLoopSchedule: false UseDirect32XEmulation: true UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 128 - _DepthUA: 128 - _DepthUB: 128 - _DepthUMetadata: 128 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -28578,10 +32429,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28589,7 +32441,6 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1Fd3Wgq88OZ1PaojzfzjpDsCp8WKZiHDJo9GXAXrgRK0= BufferLoad: true BufferStore: true CUCount: null @@ -28599,8 +32450,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -28611,16 +32462,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -28629,37 +32480,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59904 + LdsBytesNoAmax: 94720 LdsInitCVgprs: false - LdsNumBytes: 59904 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 8704 + LdsNumBytes: 94720 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 71680 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 18432 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 94720 + LdsOffsetMetadata_Blk: 154112 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -28669,10 +32520,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -28680,15 +32531,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 7] + MIWaveTileA: 10 + MIWaveTileB: 7 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 160 + MacroTile1: 448 + MacroTileA: 160 + MacroTileB: 448 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -28710,23 +32561,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 10 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 280 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 5 + NumLoadsB: 14 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 14 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -28734,7 +32583,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -28742,39 +32591,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 120 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC32_WGMXCCGn1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 40 + ThreadTile1: 7 + ThreadTileA: 40 + ThreadTileB: 7 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -28782,32 +32632,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -28826,6 +32673,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -28833,18 +32681,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT16x64x64_MI16x1oxplz8DgKAzzHI9atKMOCqO_fHvi4aY6eI_59fmy5x0= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6hKaBSm-LDSAQofLbqhwHQZEeIsXzaeyVIdwWTVQFEe8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -28855,16 +32703,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -28873,11 +32721,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 64 LSCB: 64 LSPA: 16 @@ -28886,24 +32734,24 @@ LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 23040 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 23040 - LdsNumElementsAlignedA: 4608 - LdsNumElementsAlignedB: 18432 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 4608 - LdsOffsetB_Blk: 37376 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 23040 - LdsOffsetMetadata_Blk: 37376 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -28916,7 +32764,7 @@ LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -28924,14 +32772,14 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -28955,22 +32803,20 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 NumElementsPerBatchStore: 8 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -28986,33 +32832,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 121 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 1 - ThreadTileA: 4 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -29026,32 +32873,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -29070,6 +32914,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29077,7 +32922,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x48x64_MI16x1Y2_PBeJGjEXXPI1_8Q1nplFuMWjj1kVwTjC8QiIVYG0= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6sFfDRoimNH8imnTTvcLS2cE_cU4dhe0vq9JydCcQHaY= BufferLoad: true BufferStore: true CUCount: null @@ -29087,8 +32932,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -29099,16 +32944,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -29117,37 +32962,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 32256 + LdsBytesNoAmax: 34816 LdsInitCVgprs: false - LdsNumBytes: 32256 - LdsNumElementsAlignedA: 18432 - LdsNumElementsAlignedB: 13824 + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 25600 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 18432 - LdsOffsetB_Blk: 51200 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 32256 - LdsOffsetMetadata_Blk: 51200 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 74752 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -29157,10 +33002,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -29168,15 +33013,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 3] - MIWaveTileA: 1 - MIWaveTileB: 3 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 48 + MacroTile1: 160 MacroTileA: 64 - MacroTileB: 48 + MacroTileB: 160 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29203,18 +33048,16 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 12 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 5 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 5 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -29222,7 +33065,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -29230,33 +33073,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 122 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 4 - ThreadTile1: 3 - ThreadTileA: 4 - ThreadTileB: 3 + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -29270,32 +33114,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 2 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -29310,10 +33151,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29321,20 +33163,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x96x64_MI16x6H73GL1jU92BhYVu1-N3hGNc1zsprzx_b60KuCSUZ8M= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -29343,16 +33184,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -29361,37 +33202,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 124672 + LdsBytesNoAmax: 22528 LdsInitCVgprs: false - LdsNumBytes: 124672 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 25344 + LdsNumBytes: 22528 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 17408 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 22528 + LdsOffsetMetadata_Blk: 37888 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -29399,12 +33240,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -29413,14 +33254,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] - MIWaveTileA: 4 - MIWaveTileB: 3 + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29434,31 +33275,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 8 - NumLoadsB: 6 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 6 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -29466,7 +33305,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -29474,39 +33313,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 123 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 3 - ThreadTileA: 16 - ThreadTileB: 3 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -29514,16 +33354,13 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 @@ -29531,16 +33368,16 @@ WavefrontSize: 64 WorkGroup: [32, 8, 1] WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -29552,12 +33389,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29565,20 +33403,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x160x64_MI161e0FqGLAMztP_dD6xxGdXRut9vwmQsZsotL-hvEenUg= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1EcUjFXB929D0f2TV76fVey2j0aZunr_I-f76y9iEKD8= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -29587,16 +33425,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -29605,37 +33443,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 152064 + LdsBytesNoAmax: 23040 LdsInitCVgprs: false - LdsNumBytes: 152064 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 42240 + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 4608 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 76032 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 109824 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 109824 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 51200 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -29643,12 +33481,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -29656,15 +33494,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] - MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 160 + MacroTile1: 32 MacroTileA: 128 - MacroTileB: 160 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -29678,31 +33516,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 - NumLoadsA: 8 - NumLoadsB: 10 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 10 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -29710,7 +33546,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -29718,39 +33554,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 124 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS256_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM4_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 - StoreVectorWidth: 4 - StreamK: 3 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 5 - ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 2 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -29758,50 +33595,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -29809,18 +33644,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT320x192x32_MI328ZzXv-gq9j9I9S9bJzI6udNnruDK5iaoddi5jgQlSzk= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 32 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -29831,16 +33665,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -29849,11 +33683,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT320x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 LSCA: 32 LSCB: 32 LSPA: 32 @@ -29862,26 +33696,26 @@ LVCB: 8 LVPA: 8 LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 147456 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 147456 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 73728 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 119808 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 46080 - LdsOffsetMetadata_Blk: 119808 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -29889,35 +33723,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 + LoopIters: 1 LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [5, 3] - MIWaveTileA: 5 - MIWaveTileB: 3 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 320 - MacroTile1: 192 - MacroTileA: 320 - MacroTileB: 192 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -29930,23 +33764,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 240 - NumGlobalWriteVectorsPerThread: 240 - NumLoadsA: 10 - NumLoadsB: 6 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -29954,7 +33786,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -29962,19 +33794,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 125 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT320x192x32_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_3_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC4_WGMXCCGn1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 128 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -29984,15 +33816,16 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false - ThreadTile: [1, 1] - ThreadTile0: 80 - ThreadTile1: 3 - ThreadTileA: 80 - ThreadTileB: 3 + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -30002,32 +33835,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 32 _DepthUA: 32 _DepthUB: 32 _DepthUMetadata: 32 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -30035,8 +33865,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -30046,6 +33876,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30053,7 +33884,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT160x128x64_MI16HbQjL6Xiy4_bwYKGnfEzEBO73K7v6GCjIcyS997kha4= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6mLBE_Dn-_3QTzaoio4b0fzaYzT15iyos9J3lxNRut4U= BufferLoad: true BufferStore: true CUCount: null @@ -30063,8 +33894,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -30075,16 +33906,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -30093,37 +33924,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 79872 + LdsBytesNoAmax: 24576 LdsInitCVgprs: false - LdsNumBytes: 79872 - LdsNumElementsAlignedA: 46080 - LdsNumElementsAlignedB: 33792 + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 46080 - LdsOffsetB_Blk: 177152 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 79872 - LdsOffsetMetadata_Blk: 177152 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -30133,10 +33964,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -30145,14 +33976,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [5, 4] - MIWaveTileA: 5 - MIWaveTileB: 4 + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 160 - MacroTile1: 128 - MacroTileA: 160 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -30179,18 +34010,16 @@ NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 80 - NumLoadsA: 10 - NumLoadsB: 8 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 10 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -30198,7 +34027,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -30206,37 +34035,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 126 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW1_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC1_WGMXCCGn1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 4 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 20 - ThreadTile1: 4 - ThreadTileA: 20 - ThreadTileB: 4 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -30246,32 +34076,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 4 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 1 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -30286,10 +34113,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30297,20 +34125,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3vs1Q68Ny_keXqIMp2Wsj6oT9pYijob5fQFPcck7RhE0= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3UA89VtDjVMYF_nVEcLHiT6ehHteA9v56y_f3v5d_CYI= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -30319,16 +34147,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -30337,95 +34165,95 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16640 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -30433,8 +34261,6 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -30442,7 +34268,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -30450,37 +34276,38 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 127 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 SubGroup0: 4 - SubGroup1: 64 + SubGroup1: 16 SubGroupA: 4 - SubGroupB: 64 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -30490,50 +34317,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30541,20 +34366,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3DYvE-qV_7GGecMY1_JFL64PBZq2Vbkv4V5zJot8g6OQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -30563,16 +34387,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - ForceUnrollSubIter: false + ForceDisableShadowInit: 1 + ForceUnrollSubIter: true GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -30581,51 +34405,51 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 128000 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 128000 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16640 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 @@ -30633,52 +34457,50 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -30686,7 +34508,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -30694,33 +34516,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 128 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16_WGMXCC1_WGMXCCGn1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 1 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 64 - SubGroupA: 4 - SubGroupB: 64 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -30734,33 +34557,30 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -30769,15 +34589,16 @@ enableGLTrB: false enableLDSTrA: false enableLDSTrB: false - numSubTiles: 1 + numSubTiles: 2 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -30785,18 +34606,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x32x64_MI16x1OBoPYQkKxqQdcTYaOp3l8VgmoDSmZ_j5vyKbG-QUUJg= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6Q86Al2n33vUP8nQtrYZlZYGsrzmmIC_0MPgq_9eYXoc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: 0 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -30807,16 +34628,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -30825,11 +34646,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 0 LSCA: 64 LSCB: 64 LSPA: 16 @@ -30838,37 +34659,37 @@ LVCB: 16 LVPA: 4 LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 59392 + LdsBytesNoAmax: 65536 LdsInitCVgprs: false - LdsNumBytes: 59392 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 9216 + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 32768 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 50176 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 17408 - LdsOffsetMetadata_Blk: 50176 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 2 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -30876,15 +34697,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -30906,23 +34727,21 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -30930,7 +34749,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -30938,39 +34757,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 129 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 2 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 32 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -30978,32 +34798,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 2 - VectorWidthB: 1 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -31011,17 +34828,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31029,20 +34847,19 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3EUpHQs_hYzAB34YWQur8CNtLVfZKv6w_F6Q0El8csoA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -31051,16 +34868,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -31069,104 +34886,102 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 100864 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 100864 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 92160 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 139776 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16640 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 100864 + LdsOffsetMetadata_Blk: 139776 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 9] + MIWaveTileA: 4 + MIWaveTileB: 9 MIWaveTileMetadata: 0 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 576 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 576 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 2 + NumLoadsB: 18 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 18 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -31174,7 +34989,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -31182,19 +34997,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 130 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC32_WGMXCCGn1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -31204,15 +35019,16 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 1 + ThreadTile1: 9 ThreadTileA: 16 - ThreadTileB: 1 + ThreadTileB: 9 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 + UnrollLoopSwapGlobalReadOrder: 1 UnrollMajorLDSA: true UnrollMajorLDSB: true UnrollMajorLDSMetadata: true @@ -31222,50 +35038,48 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 4 - WorkGroupMappingXCC: 32 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31273,18 +35087,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x96x64_MI16x9ICG4gPhzi9OmT20CRMj-3n344OmlrV6OXzawY3nTvU= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6hbz-dyf7FwiPWJ4x45PK0ZJquMKV4n_z15-8SqAfiXQ= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -31295,15 +35109,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -31313,50 +35127,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 61440 + LdsBytesNoAmax: 66560 LdsInitCVgprs: false - LdsNumBytes: 61440 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 27648 + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 61440 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -31364,15 +35178,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 3] + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 3 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -31393,24 +35207,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 8 - NumLoadsB: 6 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 6 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -31418,7 +35230,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -31426,33 +35238,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 131 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x96x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_3_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU8_SUM0_SUS512_SPO1_SRVW0_SSO0_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 8 + StaggerU: 16 StaggerUMapping: 0 StaggerUStride: 512 - StorePriorityOpt: 1 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 3 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 3 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -31466,37 +35279,34 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 + _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false enableLDSTrA: false @@ -31510,6 +35320,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31517,18 +35328,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16x0cYNusgHgF3Co_ShFYNVgf_9fASJQBYDTRah_BV_zEY= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 128 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -31537,18 +35347,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -31557,50 +35367,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 43008 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 26112 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82432 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -31608,15 +35418,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 - MIWaveTileB: 2 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -31638,31 +35448,29 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 6 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -31670,33 +35478,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 132 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM4_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC1_WGMXCCGn1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 512 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 4 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -31710,32 +35519,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 - VectorWidthB: 2 + VectorWidthA: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -31743,17 +35549,18 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -31761,7 +35568,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x160x64_MI16MWaFtORgD4eC8ES5PuaeS70bQdnSFwME3aBH6fKhp6A= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xL-j8Nchxvh82x3gA1ZugFs4j6lELgVzV2fA-hmOsN8= BufferLoad: true BufferStore: true CUCount: null @@ -31772,9 +35579,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -31783,15 +35590,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true @@ -31801,11 +35608,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 LSCA: 64 LSCB: 64 LSPA: 16 @@ -31817,21 +35624,21 @@ LdsBlockSizePerPadA: 1024 LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 152064 + LdsBytesNoAmax: 67584 LdsInitCVgprs: false - LdsNumBytes: 152064 + LdsNumBytes: 67584 LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 42240 + LdsNumElementsAlignedB: 33792 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 76032 + LdsOffsetA_Blk: 131072 LdsOffsetB: 33792 - LdsOffsetB_Blk: 109824 + LdsOffsetB_Blk: 164864 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33792 - LdsOffsetMetadata_Blk: 109824 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -31839,12 +35646,12 @@ LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 0 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -31853,14 +35660,14 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 5] + MIWaveTile: [4, 4] MIWaveTileA: 4 - MIWaveTileB: 5 + MIWaveTileB: 4 MIWaveTileMetadata: 0 MacroTile0: 128 - MacroTile1: 160 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 160 + MacroTileB: 128 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -31874,31 +35681,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 12 - NumElementsPerThread: 80 - NumGlobalWriteVectorsPerThread: 20 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 8 - NumLoadsB: 10 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 10 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: 8 - NumTotalPackedLoadsB: 10 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -31914,19 +35719,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 133 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS12_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO4_SVW4_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM2_WGMXCC4_WGMXCCGn1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 - StoreSwapAddr: true - StoreSyncOpt: 4 + StoreSwapAddr: false + StoreSyncOpt: 0 StoreVectorWidth: 4 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -31936,17 +35741,18 @@ SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 5 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 5 - TransposeLDS: 2 + ThreadTileB: 4 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -31954,33 +35760,30 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 4 - VectorWidthB: 1 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 2 - WorkGroupMappingXCC: 4 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -31992,12 +35795,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32005,7 +35809,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI32xuZLNd47w3L6zFk9Cg2oSluavIaHhVBvshWxewFaNb8o= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT67lNyUdTmw-cOQZDz4eeTMX4-DhVCJMsSaMIubs8z554= BufferLoad: true BufferStore: true CUCount: null @@ -32015,8 +35819,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -32025,18 +35829,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -32045,39 +35849,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 LdsBlockSizePerPadB: 512 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51712 + LdsBytesNoAmax: 43520 LdsInitCVgprs: false - LdsNumBytes: 51712 - LdsNumElementsAlignedA: 34816 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 34816 - LdsOffsetB_Blk: 100352 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51712 - LdsOffsetMetadata_Blk: 100352 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -32085,35 +35889,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 2] - MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 MIWaveTileMetadata: 0 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -32126,31 +35930,29 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -32158,33 +35960,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 134 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG128_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 16 - ThreadTileB: 2 + ThreadTileB: 4 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32198,32 +36001,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthA: 4 + VectorWidthB: 4 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [128, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -32238,10 +36038,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32249,20 +36050,20 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x16x256_MI16xWU2YXZSq42Zyg4I1M1bqQl8lWMhuX8bgcxNNF-WMw3U= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6eM9UlY0usrKvssU3Ar5zZiiJpm6PCupFKVpS-ZFqsX0= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 256 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -32271,15 +36072,15 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true @@ -32289,50 +36090,50 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 - LDSTrInst: false - LSCA: 256 - LSCB: 256 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 2048 - LdsBlockSizePerPadB: 1024 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 115712 + LdsBytesNoAmax: 24576 LdsInitCVgprs: false - LdsNumBytes: 115712 - LdsNumElementsAlignedA: 33280 - LdsNumElementsAlignedB: 16896 + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33280 - LdsOffsetB_Blk: 98816 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 33280 - LdsOffsetMetadata_Blk: 98816 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 4 + LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 2 - LoopUnroll: 64 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -32340,15 +36141,15 @@ MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [2, 1] + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] MIWaveTileA: 2 - MIWaveTileB: 1 + MIWaveTileB: 3 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 @@ -32362,31 +36163,29 @@ MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 - NonTemporalD: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -32394,7 +36193,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -32402,33 +36201,34 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 135 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT2_1_MO40_NTn1_NTA0_NTB4_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM16_WGMXCC16_WGMXCCGn1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 2 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 8 - ThreadTile1: 1 + ThreadTile1: 3 ThreadTileA: 8 - ThreadTileB: 1 + ThreadTileB: 3 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -32442,12 +36242,9 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 2 @@ -32457,17 +36254,17 @@ WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 - WorkGroupMappingXCC: 16 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 256 - _DepthUA: 256 - _DepthUB: 256 - _DepthUMetadata: 256 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -32480,12 +36277,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32493,7 +36291,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT32x256x64_MI32xPv1b-quWnWZzrEFikvQdugHi9IL2VdsODF63jr62UQ4= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ByCgfye7rWIPQ1YJohmIQ0lNCn8NrEKn7vXmxjyaZFM= BufferLoad: true BufferStore: true CUCount: null @@ -32503,8 +36301,8 @@ ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 256 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -32513,17 +36311,17 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true @@ -32533,75 +36331,75 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 76288 + LdsBytesNoAmax: 33792 LdsInitCVgprs: false - LdsNumBytes: 76288 - LdsNumElementsAlignedA: 8704 - LdsNumElementsAlignedB: 67584 + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 131072 - LdsOffsetB: 8704 - LdsOffsetB_Blk: 139776 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 76288 - LdsOffsetMetadata_Blk: 139776 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 - LocalSplitU: 1 + LocalSplitU: 4 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 2] + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] MIWaveTileA: 1 - MIWaveTileB: 2 + MIWaveTileB: 1 MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -32613,31 +36411,29 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 + NonTemporalB: 0 NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 16 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 1 PreloadKernArgs: true SFCWGM: @@ -32646,39 +36442,40 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 136 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT32x256x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_2_MO40_NTn1_NTA0_NTB4_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 1024 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 StoreVectorWidth: 1 - StreamK: 3 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 128 - SubGroupA: 2 - SubGroupB: 128 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 2 - ThreadTileA: 16 - ThreadTileB: 2 - TransposeLDS: 2 + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseCustomMainLoopSchedule: false @@ -32686,32 +36483,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 VectorWidthA: 1 - VectorWidthB: 2 + VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -32726,10 +36520,11 @@ reorderGRInstForDTVB: false tailLoopOptA: true tailLoopOptB: true - - 1LDSBuffer: 0 + - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32737,7 +36532,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3Rdlq7Rc2vP_yhCpcNjpQdCrFtftCCMe2J1b1mDh_IUI= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6et-9-ykpgaai9G-L1VaR5RDg1xNbnI7j11JfZuQQm7o= BufferLoad: true BufferStore: true CUCount: null @@ -32748,9 +36543,9 @@ CustomKernelName: '' DebugStreamK: 0 DepthU: 64 - DirectToLds: true - DirectToLdsA: true - DirectToLdsB: true + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false DirectToVgprA: false DirectToVgprB: false DirectToVgprSparseMetadata: false @@ -32759,16 +36554,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -32777,11 +36572,11 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 LSCA: 64 LSCB: 64 LSPA: 16 @@ -32791,46 +36586,46 @@ LVPA: 4 LVPB: 4 LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 98816 + LdsBytesNoAmax: 35328 LdsInitCVgprs: false - LdsNumBytes: 98816 - LdsNumElementsAlignedA: 16640 - LdsNumElementsAlignedB: 16640 + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 18432 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 65536 - LdsOffsetB: 16640 - LdsOffsetB_Blk: 82176 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 16640 - LdsOffsetMetadata_Blk: 82176 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 LocalSplitUReuseLDS: 1 LocalWritePerMfma: -1 - LocalWriteUseSgprA: true - LocalWriteUseSgprB: true - LoopIters: 4 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 LoopUnroll: 64 MFMA_BF16_1K: false - MIArchVgpr: 1 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 MIWaveTileB: 1 MIWaveTileMetadata: 0 MacroTile0: 64 @@ -32842,30 +36637,30 @@ MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 MfmaInitCVgprs: false - NoLdsWriteCode: true + NoLdsWriteCode: false NoReject: false NoTailLoop: false NonDTLTailLoopA: false NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 + NonTemporalB: 0 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 + NumElementsPerBatchStore: 8 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -32873,8 +36668,6 @@ NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 NumThreads: 256 - NumTotalPackedLoadsA: 4 - NumTotalPackedLoadsB: 4 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -32890,19 +36683,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 137 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD4_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM4_WGMXCC16_WGMXCCGn1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 256 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 4 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -32912,6 +36705,7 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] ThreadTile0: 16 ThreadTile1: 1 @@ -32930,33 +36724,30 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: true - UseGeneralizedNLCOneB: true - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 0 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 + VectorWidthA: 4 VectorWidthB: 1 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 4 - WorkGroupMappingXCC: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] + WorkspaceCheck: [4, 0, -1] _DepthU: 64 _DepthUA: 64 _DepthUB: 64 _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer - _UseSgprForGRO: 0 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -32968,12 +36759,13 @@ numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false + tailLoopOptA: true + tailLoopOptB: true - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -32981,18 +36773,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT64x64x64_MI32x3CZS8qX8a5jcy85YI3uVSITdEfZd1y3IMs4eLTwn3ELQ= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ZglzLcUOj7D9-4HA3xQ6qm78zoBYVUYUrp0ST-4N2Tc= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -33003,16 +36795,16 @@ EnableMatrixInstruction: true ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -33021,39 +36813,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 34816 + LdsBytesNoAmax: 78336 LdsInitCVgprs: false - LdsNumBytes: 34816 - LdsNumElementsAlignedA: 17408 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 78336 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 55296 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 17408 - LdsOffsetB_Blk: 82944 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 34816 - LdsOffsetMetadata_Blk: 82944 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetMetadata: 78336 + LdsOffsetMetadata_Blk: 154112 + LdsPadA: 8 + LdsPadB: 8 LdsPadMetadata: 0 LocalReadVectorWidth: 4 LocalSplitU: 1 @@ -33061,35 +36853,35 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 0 - MIBlock: [32, 32, 16, 1, 1, 1] + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 MIInputPerThreadB: 8 MIInputPerThreadMetadata: 8 MIOutputVectorWidth: 4 MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 6] + MIWaveTileA: 10 + MIWaveTileB: 6 MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 160 + MacroTile1: 384 + MacroTileA: 160 + MacroTileB: 384 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 MatrixInstB: 1 MatrixInstBM: 1 MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 32 - MatrixInstN: 32 - MatrixInstruction: [32, 32, 16, 1] + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] MaxLDS: 163840 MaxOccupancy: 40 MbskPrefetchMethod: 0 @@ -33101,24 +36893,22 @@ NonDTLTailLoopB: false NonTemporal: -1 NonTemporalA: 0 - NonTemporalB: 4 - NonTemporalC: 4 - NonTemporalD: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 16 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 5 + NumLoadsB: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 12 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] @@ -33126,7 +36916,7 @@ PackedC1IdxChars: [J] PackedC1IndicesX: [1] PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -33134,19 +36924,19 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 138 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x64_MI32x32x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA4_LPB4_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB4_NTC4_NTD0_NTM0_NEPBS16_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO1_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM2_WGMXCC8_WGMXCCGn1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT160x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 1 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 3 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 StreamKXCCMapping: 0 @@ -33156,11 +36946,12 @@ SubGroupB: 64 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 + ThreadTile0: 40 + ThreadTile1: 6 + ThreadTileA: 40 + ThreadTileB: 6 TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 @@ -33174,32 +36965,29 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 + VectorWidthA: 2 + VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 WaveSeparateGlobalReadMetadata: 0 WaveSplitK: false WavefrontSize: 64 - WorkGroup: [64, 4, 1] - WorkGroupMapping: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -33218,6 +37006,7 @@ ActivationAlt: false ActivationFuncCall: false ActivationFused: true + AdaptiveGemm: 1 AssertAIGreaterThanEqual: -1 AssertAILessThanEqual: -1 AssertFree0ElementMultiple: 1 @@ -33225,18 +37014,18 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_S_MX_B_BiasS_HAS_SAV_UserArgs_MT128x64x64_MI16xFyWqcs6KVFJkQ8kLhxkp503bjalLMrD-hQMHlzpWf2g= + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6HKFsR1xpqIqB8iGFlETrmdcYqlXRxk0kl_LfZapGERo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: 4 ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 64 - DirectToLds: 0 + DepthU: 32 + DirectToLds: false DirectToLdsA: false DirectToLdsB: false DirectToVgprA: false @@ -33245,18 +37034,18 @@ EdgeType: ShiftPtr EnableF32XdlMathOp: true EnableMatrixInstruction: true - ExpandPointerSwap: true + ExpandPointerSwap: 0 ExpertSchedulingMode: 0 - ForceDisableShadowInit: false + ForceDisableShadowInit: 1 ForceUnrollSubIter: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 - GlobalSplitU: 0 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 GroupLoadStore: false GuaranteeNoPartialA: true GuaranteeNoPartialB: true @@ -33265,37 +37054,37 @@ InnerUnroll: 1 InterleaveAlpha: 0 InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true} + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 51200 + LdsBytesNoAmax: 18432 LdsInitCVgprs: false - LdsNumBytes: 51200 - LdsNumElementsAlignedA: 33792 - LdsNumElementsAlignedB: 17408 + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 65536 - LdsOffsetB: 33792 - LdsOffsetB_Blk: 99328 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 51200 - LdsOffsetMetadata_Blk: 99328 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 LdsPadA: 8 LdsPadB: 8 LdsPadMetadata: 0 @@ -33305,10 +37094,10 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 64 + LoopIters: 1 + LoopUnroll: 32 MFMA_BF16_1K: false - MIArchVgpr: 1 + MIArchVgpr: false MIBlock: [16, 16, 32, 1, 1, 1] MIInputPerThread: 8 MIInputPerThreadA: 8 @@ -33317,13 +37106,13 @@ MIOutputVectorWidth: 4 MIRegPerOut: 1 MIWaveGroup: [2, 2] - MIWaveTile: [4, 2] - MIWaveTileA: 4 + MIWaveTile: [2, 2] + MIWaveTileA: 2 MIWaveTileB: 2 MIWaveTileMetadata: 0 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MagicDivAlg: 2 MathClocksUnrolledLoop: 0 @@ -33346,31 +37135,29 @@ NonTemporal: -1 NonTemporalA: 0 NonTemporalB: 0 - NonTemporalC: 4 + NonTemporalC: 0 NonTemporalD: 4 NonTemporalE: 0 NonTemporalMetadata: 0 NonTemporalWS: 0 - NumElementsPerBatchStore: 4 - NumElementsPerThread: 32 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 - NumTotalPackedLoadsA: -1 - NumTotalPackedLoadsB: -1 NumWaveSplitK: 1 OptNoLoadLoop: 1 PackedC0IdxChars: [I] PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 PreloadKernArgs: true SFCWGM: - [1, 1] @@ -33378,32 +37165,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 139 - SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS1_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV1_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC4_NTD4_NTM0_NEPBS4_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS0_SPO0_SRVW0_SSO1_SVW4_SK3_SKFTR0_SKXCCM8_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM32_WGMXCC2_WGMXCCGn1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 SourceSwap: 1 SpaceFillingAlgo: [] - StaggerU: 0 + StaggerU: 16 StaggerUMapping: 0 - StaggerUStride: 0 - StorePriorityOpt: 0 + StaggerUStride: 128 + StorePriorityOpt: false StoreRemapVectorWidth: 0 StoreSwapAddr: false - StoreSyncOpt: 1 - StoreVectorWidth: 4 - StreamK: 3 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 StreamKAtomic: 0 StreamKFixupTreeReduction: 0 - StreamKXCCMapping: 8 + StreamKXCCMapping: 0 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 SuppressNoLoadLoop: false SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 ThreadTile: [1, 1] - ThreadTile0: 16 + ThreadTile0: 8 ThreadTile1: 2 - ThreadTileA: 16 + ThreadTileA: 8 ThreadTileB: 2 TransposeLDS: 1 TransposeLDSMetadata: true @@ -33418,15 +37206,12 @@ UseDot2F32XEmulation: false UseDotInstruction: false UseF32XEmulation: true - UseGeneralizedNLCOneA: false - UseGeneralizedNLCOneB: false - UseGeneralizedNLCOneMetadata: false UseInstOffsetForGRO: 0 UsePLRPack: false - UseSgprForGRO: 1 + UseSgprForGRO: -1 Valid: true VectorStore: -1 - VectorWidthA: 4 + VectorWidthA: 2 VectorWidthB: 2 WaveSeparateGlobalReadA: 0 WaveSeparateGlobalReadB: 0 @@ -33434,16 +37219,16 @@ WaveSplitK: false WavefrontSize: 64 WorkGroup: [32, 8, 1] - WorkGroupMapping: 32 - WorkGroupMappingXCC: 2 + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false - WorkspaceCheck: [4, 0, 0] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: PartialsBuffer + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 @@ -33451,8 +37236,8 @@ _staggerStrideShift: 0 enableGLTrA: false enableGLTrB: false - enableLDSTrA: false - enableLDSTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 numSubTiles: 1 reorderGRInstForDTVA: false reorderGRInstForDTVB: false @@ -33460,285 +37245,327 @@ tailLoopOptB: true - [2, 3, 0, 1] - - - [233, 128, 1024, 32] - - [104, 0.0] + - [60, 0.0] - - [512, 8192, 1, 3072] - [0, 0.0] - - [512, 8192, 1, 3960] - - [60, 0.0] + - [38, 0.0] - - [512, 8192, 1, 5640] - - [61, 0.0] + - [39, 0.0] - - [528, 8192, 1, 256] - - [62, 0.0] + - [109, 57.71] - - [528, 8192, 1, 512] - - [1, 0.0] + - [110, 74.32] - - [1024, 8192, 1, 1980] - - [2, 0.0] + - [1, 0.0] - - [1024, 8192, 1, 3840] - - [3, 0.0] + - [2, 0.0] - - [2440, 8192, 1, 128] - - [63, 0.0] + - [94, 59.84] - - [5640, 8192, 1, 128] - - [4, 0.0] + - [3, 0.0] - - [61, 128, 8192, 40] - - [64, 0.0] + - [40, 0.0] - - [128, 30, 8192, 4] - - [5, 0.0] + - [116, 2.44] - - [128, 33, 8192, 16] - - [134, 14661.8] + - [117, 8.65] - - [128, 61, 8192, 40] - - [65, 0.0] + - [118, 18.88] - - [41, 17711, 1, 128] - - [121, 22531.0] + - [77, 22531.0] - - [96, 17711, 1, 768] - - [6, 0.0] + - [126, 51.2] - - [256, 17711, 1, 887] - - [124, 166959.0] + - [119, 78.04] - - [384, 17711, 1, 2732] - - [7, 0.0] + - [4, 0.0] - - [960, 17711, 1, 128] - - [90, 0.0] + - [120, 55.24] - - [2480, 17711, 1, 128] - - [125, 107650.0] + - [121, 63.59] - - [48, 124, 17711, 20] - - [8, 0.0] + - [127, 12.27] - - [128, 17711, 6, 128] - - [103, 0.0] + - [59, 0.0] - - [10, 655360, 1, 160] - - [9, 0.0] + - [5, 0.0] - - [28, 4096, 1, 256] - - [135, 11097.5] + - [151, 4.86] - - [32, 262144, 1, 57] - - [136, 34735.6] + - [114, 17.74] - - [32, 262144, 1, 60] - - [66, 0.0] + - [131, 17.58] - - [32, 262144, 1, 82] - - [10, 0.0] + - [6, 0.0] - - [32, 262144, 1, 84] - - [11, 0.0] + - [138, 18.4] - - [48, 655360, 1, 192] - - [12, 0.0] + - [7, 0.0] - - [57, 4096, 1, 2048] - - [68, 0.0] + - [42, 0.0] - - [64, 4096, 1, 2048] - - [13, 0.0] + - [142, 29.94] - - [64, 102400, 1, 64] - - [69, 0.0] + - [133, 31.44] - - [64, 131072, 1, 128] - - [14, 0.0] + - [145, 40.14] - - [64, 527553, 1, 224] - - [15, 0.0] + - [149, 54.03] - - [64, 752863, 1, 224] - - [71, 0.0] + - [43, 0.0] - - [64, 806154, 1, 288] - - [72, 0.0] + - [44, 0.0] - - [72, 4096, 1, 256] - - [16, 0.0] + - [8, 0.0] - - [82, 4096, 1, 2048] - - [17, 0.0] + - [146, 31.41] - - [112, 655360, 1, 192] - - [18, 0.0] + - [130, 60.1] - - [116, 4096, 1, 256] - - [19, 0.0] + - [9, 0.0] - - [128, 4096, 1, 1600] - - [20, 0.0] + - [146, 40.1] - - [128, 131072, 1, 64] - - [21, 0.0] + - [10, 0.0] - - [160, 4096, 1, 512] - - [22, 0.0] + - [147, 28.98] - - [160, 4096, 1, 2048] - - [80, 0.0] + - [46, 0.0] - - [180, 4096, 1, 256] - - [23, 0.0] + - [11, 0.0] - - [256, 4096, 1, 28] - - [24, 0.0] + - [104, 4.69] - - [256, 4096, 1, 72] - - [25, 0.0] + - [12, 0.0] - - [256, 4096, 1, 116] - - [26, 0.0] + - [13, 0.0] - - [256, 4096, 1, 256] - - [137, 63493.6] + - [136, 27.9] - - [256, 4096, 1, 4132] - - [83, 0.0] + - [49, 0.0] - - [256, 4096, 1, 7680] - - [27, 0.0] + - [14, 0.0] - - [304, 655360, 1, 644] - - [28, 0.0] + - [135, 94.46] - - [320, 4096, 1, 116] - - [29, 0.0] + - [15, 0.0] - - [320, 4096, 1, 180] - - [30, 0.0] + - [16, 0.0] - - [512, 4096, 1, 96] - - [31, 0.0] + - [17, 0.0] - - [512, 4096, 1, 160] - - [32, 0.0] + - [154, 37.99] - - [512, 4096, 1, 512] - - [33, 0.0] + - [144, 58.22] - - [512, 4096, 1, 2246] - - [34, 0.0] + - [148, 82.64] - - [512, 4096, 1, 4132] - - [89, 0.0] + - [52, 0.0] - - [512, 4096, 1, 7680] - - [35, 0.0] + - [134, 105.67] - - [2048, 4096, 1, 128] - - [93, 0.0] + - [54, 0.0] - - [2048, 4096, 1, 2048] - - [36, 0.0] + - [18, 0.0] - - [2048, 4096, 1, 2624] - - [37, 0.0] + - [19, 0.0] - - [2246, 4096, 1, 512] - - [38, 0.0] + - [20, 0.0] - - [2560, 4096, 1, 4096] - - [94, 0.0] + - [55, 0.0] - - [25, 25, 8192, 32] - - [96, 0.0] + - [56, 0.0] - - [32, 25, 8192, 25] - - [97, 0.0] + - [57, 0.0] - - [64, 57, 4096, 32] - - [98, 0.0] + - [105, 15.56] - - [64, 82, 4096, 32] - - [99, 0.0] + - [141, 18.45] - - [160, 642, 4096, 48] - - [39, 0.0] + - [153, 29.42] - - [200, 32, 4096, 64] - - [40, 0.0] + - [139, 18.97] - - [642, 160, 4096, 48] - - [41, 0.0] + - [21, 0.0] - - [128, 2048, 1, 256] - - [42, 0.0] + - [22, 0.0] - - [128, 2048, 1, 1024] - - [79, 0.0] + - [45, 0.0] - - [256, 2048, 1, 32] - - [43, 0.0] + - [23, 0.0] - - [256, 2048, 1, 36] - - [44, 0.0] + - [24, 0.0] - - [256, 2048, 1, 40] - - [45, 0.0] + - [25, 0.0] - - [256, 2048, 1, 48] - - [46, 0.0] + - [26, 0.0] - - [256, 2048, 1, 64] - - [120, 13745.7] + - [76, 13745.7] - - [256, 2048, 1, 72] - - [47, 0.0] + - [27, 0.0] - - [256, 2048, 1, 80] - - [48, 0.0] + - [28, 0.0] - - [256, 2048, 1, 96] - - [49, 0.0] + - [29, 0.0] - - [256, 2048, 1, 128] - - [50, 0.0] + - [30, 0.0] - - [256, 2048, 1, 256] - - [51, 0.0] + - [31, 0.0] - - [512, 2048, 1, 14336] - - [87, 0.0] + - [50, 0.0] - - [120, 8192, 1, 256] - - [127, 56756.2] + - [95, 24.98] - - [128, 8192, 1, 512] - - [130, 93560.3] + - [99, 41.14] - - [128, 8192, 1, 4352] - - [52, 0.0] + - [32, 0.0] - - [128, 8192, 1, 5120] - - [53, 0.0] + - [33, 0.0] - - [128, 8192, 1, 7296] - - [54, 0.0] + - [34, 0.0] - - [128, 98304, 1, 256] - - [131, 129793.0] + - [103, 73.12] - - [256, 8192, 1, 120] - - [55, 0.0] + - [104, 27.41] - - [256, 8192, 1, 128] - - [56, 0.0] + - [105, 32.62] - - [256, 8192, 1, 512] - - [57, 0.0] + - [35, 0.0] - - [256, 8192, 1, 4352] - - [58, 0.0] + - [36, 0.0] - - [512, 8192, 1, 1024] - - [133, 200366.0] + - [80, 200366.0] - - [512, 8192, 1, 2048] - - [59, 0.0] + - [37, 0.0] - - [56, 131072, 1, 233] - - [67, 0.0] + - [41, 0.0] - - [64, 131072, 1, 64] - - [70, 0.0] + - [132, 36.24] - - [128, 1024, 1, 64] - - [73, 0.0] + - [87, 1.72] - - [128, 1024, 1, 72] - - [74, 0.0] + - [89, 1.7] - - [128, 1024, 1, 96] - - [75, 0.0] + - [93, 2.37] - - [128, 1024, 1, 128] - - [76, 0.0] + - [82, 3.08] - - [128, 1024, 1, 144] - - [77, 0.0] + - [88, 2.98] - - [128, 1024, 1, 4096] - - [78, 0.0] + - [85, 31.71] - - [128, 17711, 1, 128] - - [122, 58658.9] + - [125, 27.26] - - [256, 1024, 1, 7968] - - [81, 0.0] + - [47, 0.0] - - [256, 4096, 1, 180] - - [82, 0.0] + - [48, 0.0] - - [320, 4096, 1, 28] - - [84, 0.0] + - [150, 5.38] - - [320, 4096, 1, 72] - - [85, 0.0] + - [141, 11.57] - - [512, 1024, 1, 2011] - - [86, 0.0] + - [86, 46.44] - - [512, 4096, 1, 80] - - [88, 0.0] + - [51, 0.0] - - [1024, 2048, 1, 14336] - - [91, 0.0] + - [53, 0.0] - - [2011, 1024, 1, 512] - - [92, 0.0] + - [83, 58.83] - - [7456, 1024, 1, 128] - - [95, 0.0] + - [91, 58.56] - - [64, 4096, 96, 160] - - [100, 0.0] + - [140, 45.87] - - [124, 48, 17711, 20] - - [101, 0.0] + - [58, 0.0] - - [128, 233, 1024, 32] - - [102, 0.0] + - [94, 25.13] - - [64, 9419, 1, 5120] - - [114, 0.0] + - [70, 0.0] - - [64, 9420, 1, 5120] - - [105, 0.0] + - [61, 0.0] - - [64, 18389, 1, 5120] - - [106, 0.0] + - [62, 0.0] - - [64, 18392, 1, 5120] - - [107, 0.0] + - [63, 0.0] - - [64, 21090, 1, 5120] - - [117, 0.0] + - [73, 0.0] - - [64, 21092, 1, 5120] - - [108, 0.0] + - [64, 0.0] - - [5120, 1, 1, 256] - - [109, 0.0] + - [65, 0.0] - - [5120, 1, 1, 5120] - - [110, 0.0] + - [66, 0.0] - - [30720, 1, 1, 5120] - - [111, 0.0] + - [67, 0.0] - - [64, 4106, 1, 5120] - - [112, 0.0] + - [68, 0.0] - - [64, 4200, 1, 5120] - - [113, 0.0] + - [69, 0.0] - - [64, 9450, 1, 5120] - - [115, 0.0] + - [71, 0.0] - - [64, 9452, 1, 5120] - - [116, 0.0] + - [72, 0.0] - - [64, 21263, 1, 5120] - - [118, 0.0] + - [74, 0.0] - - [64, 21264, 1, 5120] - - [119, 0.0] + - [75, 0.0] - - [128, 17711, 1, 928] - - [123, 144111.0] + - [123, 60.29] - - [17711, 246, 1, 384] - - [126, 123978.0] + - [78, 123978.0] - - [120, 8192, 1, 512] - - [128, 83879.6] + - [96, 36.23] - - [128, 8192, 1, 64] - - [129, 22804.1] + - [97, 10.73] - - [512, 8192, 1, 256] - - [132, 128855.0] + - [79, 128855.0] - - [512, 4096, 1, 64] - - [138, 37015.4] + - [152, 18.43] - - [4096, 1024, 1, 128] - - [139, 87046.7] + - [81, 87046.7] + - - [128, 1024, 1, 512] + - [84, 9.27] + - - [128, 1024, 1, 256] + - [90, 5.58] + - - [7968, 1024, 1, 256] + - [92, 77.41] + - - [128, 8192, 1, 128] + - [98, 18.51] + - - [128, 8192, 1, 5640] + - [100, 70.94] + - - [128, 8192, 1, 6912] + - [101, 76.65] + - - [128, 8192, 1, 10880] + - [102, 78.44] + - - [256, 8192, 1, 256] + - [106, 45.29] + - - [256, 8192, 1, 528] + - [107, 52.71] + - - [256, 8192, 1, 4608] + - [108, 92.0] + - - [1980, 8192, 1, 512] + - [111, 98.91] + - - [3072, 8192, 1, 512] + - [112, 110.5] + - - [5120, 8192, 1, 128] + - [113, 69.7] + - - [32, 128, 8192, 4] + - [114, 2.59] + - - [36, 128, 8192, 16] + - [115, 8.31] + - - [128, 17711, 1, 256] + - [122, 41.1] + - - [128, 17711, 1, 252] + - [124, 34.87] + - - [41, 128, 17711, 6] + - [128, 3.24] + - - [64, 819200, 1, 64] + - [129, 34.01] + - - [192, 160, 4096, 48] + - [137, 26.38] + - - [9216, 4096, 1, 512] + - [143, 114.52] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_UserArgs.yaml new file mode 100644 index 00000000000..44c25a0a363 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx950/Equality/gfx950_Cijk_Alik_Bljk_S_MX_B_UserArgs.yaml @@ -0,0 +1,17818 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx950 +- gfx950 +- [Device 75a0] +- Activation: false + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: none + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DataTypeA: 0 + DataTypeAmaxD: 0 + DataTypeB: 0 + DataTypeE: 0 + DestDataType: 0 + F32XdlMathOp: 10 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 0 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 0 + UseScaleCD: false +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3ZQt4emcWG8guHWikbv6OxFJy790l58gtvM3nfZjMIJE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34304 + LdsInitCVgprs: false + LdsNumBytes: 34304 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34304 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3e4Dw_hz57yPEZN_qoLaorfGepQNCz75gt6VQs5_mgZo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6nB1C3iOYxU_7DUOjuOyODiZ9rVSorIjLm4u2U10uDaw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 50176 + LdsInitCVgprs: false + LdsNumBytes: 50176 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 98816 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50176 + LdsOffsetMetadata_Blk: 98816 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 66560 + LdsInitCVgprs: false + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3SwF0bvQxB0RrhRxtPMVt3TyizF16j4vW99jq_X9KpHk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30208 + LdsInitCVgprs: false + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3YeRtOHUsp9ttyStfNKrKENY_vaUUjqgLCrwM91_2-aY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3cSFHPElhrbZabUpjPK0idMvlUk8E6jLU5EZAOZd0T7g= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 30208 + LdsInitCVgprs: false + LdsNumBytes: 30208 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 25600 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 9216 + LdsOffsetMetadata_Blk: 25600 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x16x64_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1I2reK521Z0OJ8WcGapE1-BfPXpcGPiwVna1dA3-Ll3c= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ULSHV5TvKv-3s7N5kPB29fM38e7xZ96gnPlfW9sr_dw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 13312 + LdsInitCVgprs: false + LdsNumBytes: 13312 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 10752 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1Gtebt9rQIpwe9oFGHoWcPG_t3BC0dLSpyLHC5C0Kq0M= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6V3GWrV8q1KT6qcNF3NRQe6SaKzZRXLdh2CbWinGWLAU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 8704 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6zsxRwkMqBm0-RotUdPfZ2aFGnmO-YpXpopyCt-rV2s4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67072 + LdsInitCVgprs: false + LdsNumBytes: 67072 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67072 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 4] + MIWaveTileA: 2 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6HCXW7Md-kYsZ-1UnmrrC7uG9YBlO7z5sEs9KMCRnZRY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 1] + MIWaveTileA: 2 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT68SIh_bnF1917_7343uVm45oe_0GQqSHLfLK0W_wi_u8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6oabzLLQwOO03l3MEmNqlhJ6dLm9NnzDdPu-7gnuHCJQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 66560 + LdsInitCVgprs: false + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6R02QOCgIPWslloJiox9BjhuIrZ3CRli_uOaWUztN7xk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99840 + LdsInitCVgprs: false + LdsNumBytes: 99840 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 66560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1AxfD8GQK1funv0jxGwFu2nRz6aluGc9YL9jcdz-7fEk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 131072 + LdsInitCVgprs: false + LdsNumBytes: 131072 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 197632 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 99840 + LdsInitCVgprs: false + LdsNumBytes: 99840 + LdsNumElementsAlignedA: 66560 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 66560 + LdsOffsetB_Blk: 197632 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 99840 + LdsOffsetMetadata_Blk: 197632 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wZn1lj5-xh86AxV8XC_TfIrJlVoj_3cWBkWnmMVsin8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 72704 + LdsInitCVgprs: false + LdsNumBytes: 72704 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 55296 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 72704 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 6] + MIWaveTileA: 8 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 384 + MacroTileA: 128 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 48 + NumLoadsA: 4 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 6 + ThreadTileA: 32 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT65gE-MCZm4_zHo6C5uZDRUoKD8EXmAPJ3N4uk_Y8yfys= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 18432 + LdsInitCVgprs: false + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6I8O4paswkmb66D5oBp-XVQKwsaElMfkk9UWM0xAgsVs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 18432 + LdsInitCVgprs: false + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6eRXgyMefjCuWXVIY2LbceXYoQhTimXSfxsYDWpNFOp0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 51712 + LdsInitCVgprs: false + LdsNumBytes: 51712 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 51712 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT189iqYKY55VEcPzDOXv9ylhp1p78P2DCBr2S6_6nafkw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 79872 + LdsInitCVgprs: false + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 46080 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 46080 + LdsOffsetB_Blk: 177152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 177152 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 128 + MacroTileA: 160 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 5 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 10 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 10 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 90624 + LdsInitCVgprs: false + LdsNumBytes: 90624 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 67584 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 90624 + LdsOffsetMetadata_Blk: 154112 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [5, 4] + MIWaveTileA: 5 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 80 + MacroTile1: 256 + MacroTileA: 80 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 5 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 80 + NumLoadsA: 5 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT80x256x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT5_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS5_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 20 + ThreadTile1: 4 + ThreadTileA: 20 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 87040 + LdsInitCVgprs: false + LdsNumBytes: 87040 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 69632 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 148480 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 87040 + LdsOffsetMetadata_Blk: 148480 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 512 + MacroTileA: 128 + MacroTileB: 512 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x512x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT8_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 32 + ThreadTile1: 8 + ThreadTileA: 32 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128000 + LdsInitCVgprs: false + LdsNumBytes: 128000 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 2 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1U7vOvdYFB8EejoI1DeZ2XXlfHVNEE5d0H12vbNNpATs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3OvWxntshUWW3_A-VI6XNSghru3U4UDSTDhjcjVR3HKY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23040 + LdsInitCVgprs: false + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT46N5m9gwCFO-Nzc9yENVU3SI1QrpCiOXEjeEPFSn86Hk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 40448 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 40448 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1vaiUWgr6lZ68qXC_TGMU65523uOYKq3Ec6eraxi_h38= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23040 + LdsInitCVgprs: false + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1pFiexEy_nQA9jS434_CfI7aGjSUEPvYPE95_vzPQKTw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 7680 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 48 + MacroTileA: 128 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x48x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1H4SRQMHnBm8MPmn7CC4vovQuF9Klt0xNrGusZalzPig= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26624 + LdsInitCVgprs: false + LdsNumBytes: 26624 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26624 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 2] + MIWaveTileA: 4 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 2 + ThreadTileA: 16 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1rXTuKdoyxclHidecoClA0AfcqpENiqcNipr2eRQNhwg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 79872 + LdsInitCVgprs: false + LdsNumBytes: 79872 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 46080 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 79872 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9zD0PX1FPhAiuZDs_KjdPAn76aiPUEhRhVS2tkop5n3A= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 44544 + LdsInitCVgprs: false + LdsNumBytes: 44544 + LdsNumElementsAlignedA: 13824 + LdsNumElementsAlignedB: 30720 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 13824 + LdsOffsetB_Blk: 79360 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 44544 + LdsOffsetMetadata_Blk: 79360 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [6, 3] + MIWaveTileA: 6 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 192 + MacroTileA: 96 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 72 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 3 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 3 + ThreadTileA: 24 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT19UJ9YnHb6cR5JwasXZkP1sk4AUOzy6Nd_GCne8pVOR0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 160 + MacroTileA: 128 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 4 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1wWhUNhsLXnwdUgsCNHLafWrE-J4dg946fN2_q4HV5w8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 57856 + LdsInitCVgprs: false + LdsNumBytes: 57856 + LdsNumElementsAlignedA: 34816 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 34816 + LdsOffsetB_Blk: 100352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 57856 + LdsOffsetMetadata_Blk: 100352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 81920 + LdsInitCVgprs: false + LdsNumBytes: 81920 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 23040 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 99328 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 56832 + LdsOffsetMetadata_Blk: 99328 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 80 + MacroTileA: 128 + MacroTileB: 80 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 10 + NumLoadsA: 8 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x80x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1KkEUdHGIe6dKJ0zzMlcYFBbwtI1FwPuq4-b2hkE_RVc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 32768 + LdsInitCVgprs: false + LdsNumBytes: 32768 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 32768 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 3] + MIWaveTileA: 4 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 3 + ThreadTileA: 16 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT65_iMlfIkk8B93jR0j9fItJcWD8qLSyBShDieS7L1wt0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 31232 + LdsInitCVgprs: false + LdsNumBytes: 31232 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 13824 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 50176 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 31232 + LdsOffsetMetadata_Blk: 50176 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 48 + MacroTileA: 64 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 12 + NumGlobalWriteVectorsPerThread: 6 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x48x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_4_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 4, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT9gGJDLTTlViLZGJavc0sPMnxgvCGjSJIqu4gt9wnKCkU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 55296 + LdsInitCVgprs: false + LdsNumBytes: 55296 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 27648 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 55296 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [3, 3] + MIWaveTileA: 3 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 96 + MacroTile1: 96 + MacroTileA: 96 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 36 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 6 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT96x96x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 3 + ThreadTileA: 12 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT45ulS7WcdI7UNW-ipBhI5_9g0NWEweK1v4iw5AdMdwyw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 2 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB2_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 40448 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 40448 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB2_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT45Pf77PZ0y2a63oqBv5xuSEpvYfkv5rrfz2OggkgqUbU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 26112 + LdsInitCVgprs: false + LdsNumBytes: 26112 + LdsNumElementsAlignedA: 7680 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 7680 + LdsOffsetB_Blk: 40448 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 26112 + LdsOffsetMetadata_Blk: 40448 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [3, 2] + MIWaveTileA: 3 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 48 + MacroTile1: 128 + MacroTileA: 48 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT48x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT3_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 12 + ThreadTile1: 2 + ThreadTileA: 12 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6-PpbdGeEL52tGWSKuV1d3aERKjhvbI9dAbn_gKJooqY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 59904 + LdsInitCVgprs: false + LdsNumBytes: 59904 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 51200 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 59904 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 5] + MIWaveTileA: 4 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 320 + MacroTileA: 64 + MacroTileB: 320 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 80 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 10 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 10 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x320x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 5 + ThreadTileA: 16 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1zKzQos5jBY-oZsuABvfgC-YVr0XQkeZ9FoaVzwSEWkA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 2 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT112x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS7_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 48640 + LdsInitCVgprs: false + LdsNumBytes: 48640 + LdsNumElementsAlignedA: 17920 + LdsNumElementsAlignedB: 30720 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17920 + LdsOffsetB_Blk: 83456 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 48640 + LdsOffsetMetadata_Blk: 83456 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [7, 3] + MIWaveTileA: 7 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 112 + MacroTile1: 192 + MacroTileA: 112 + MacroTileB: 192 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 7 + NumElementsPerThread: 84 + NumGlobalWriteVectorsPerThread: 84 + NumLoadsA: 7 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 7 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT112x192x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA2_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT7_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS7_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 28 + ThreadTile1: 3 + ThreadTileA: 28 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3v1NwFnVXJTJCep_voPt7RyBBkZRSCMUMhO_N-SZ5gL8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23040 + LdsInitCVgprs: false + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 37376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 37376 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6C54sGtPjCJ3V-cPRAE3Ns8NHt0h_voSZTrp24ROARTw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6uAdk8COtu0uSDFSBZJBI7WxTlIvedN3IPIedLDwGas0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 94720 + LdsInitCVgprs: false + LdsNumBytes: 94720 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 71680 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 94720 + LdsOffsetMetadata_Blk: 154112 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 7] + MIWaveTileA: 10 + MIWaveTileB: 7 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 448 + MacroTileA: 160 + MacroTileB: 448 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 280 + NumGlobalWriteVectorsPerThread: 140 + NumLoadsA: 5 + NumLoadsB: 14 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 14 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x448x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_7_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 7 + ThreadTileA: 40 + ThreadTileB: 7 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6hKaBSm-LDSAQofLbqhwHQZEeIsXzaeyVIdwWTVQFEe8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 17408 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 17408 + LdsOffsetB_Blk: 82944 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 82944 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6sFfDRoimNH8imnTTvcLS2cE_cU4dhe0vq9JydCcQHaY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 34816 + LdsInitCVgprs: false + LdsNumBytes: 34816 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 25600 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 74752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 34816 + LdsOffsetMetadata_Blk: 74752 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 5] + MIWaveTileA: 2 + MIWaveTileB: 5 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 160 + MacroTileA: 64 + MacroTileB: 160 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 40 + NumGlobalWriteVectorsPerThread: 20 + NumLoadsA: 2 + NumLoadsB: 5 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 5 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x160x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_5_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 5 + ThreadTileA: 8 + ThreadTileB: 5 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 22528 + LdsInitCVgprs: false + LdsNumBytes: 22528 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 17408 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 37888 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 22528 + LdsOffsetMetadata_Blk: 37888 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 4] + MIWaveTileA: 1 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x128x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA128_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1EcUjFXB929D0f2TV76fVey2j0aZunr_I-f76y9iEKD8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 23040 + LdsInitCVgprs: false + LdsNumBytes: 23040 + LdsNumElementsAlignedA: 18432 + LdsNumElementsAlignedB: 4608 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 18432 + LdsOffsetB_Blk: 51200 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 23040 + LdsOffsetMetadata_Blk: 51200 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [4, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x32x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [64, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6mLBE_Dn-_3QTzaoio4b0fzaYzT15iyos9J3lxNRut4U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24576 + LdsInitCVgprs: false + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT3UA89VtDjVMYF_nVEcLHiT6ehHteA9v56y_f3v5d_CYI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x32x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 128000 + LdsInitCVgprs: false + LdsNumBytes: 128000 + LdsNumElementsAlignedA: 27648 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 27648 + LdsOffsetB_Blk: 93184 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 27648 + LdsOffsetMetadata_Blk: 93184 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [6, 8] + MIWaveTileA: 6 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 192 + MacroTile1: 256 + MacroTileA: 192 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 6 + NumElementsPerThread: 192 + NumGlobalWriteVectorsPerThread: 96 + NumLoadsA: 6 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 6 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT192x256x32_MI16x16x1_SN_LDSB0_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS6_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 24 + ThreadTile1: 8 + ThreadTileA: 24 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 2 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6Q86Al2n33vUP8nQtrYZlZYGsrzmmIC_0MPgq_9eYXoc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2 + LDSTrInst: 0 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 50688 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 2 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_8_2_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 100864 + LdsInitCVgprs: false + LdsNumBytes: 100864 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 92160 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 139776 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 100864 + LdsOffsetMetadata_Blk: 139776 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 9] + MIWaveTileA: 4 + MIWaveTileB: 9 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 576 + MacroTileA: 64 + MacroTileB: 576 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 144 + NumGlobalWriteVectorsPerThread: 36 + NumLoadsA: 2 + NumLoadsB: 18 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 18 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x576x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA512_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_9_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM4_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 9 + ThreadTileA: 16 + ThreadTileB: 9 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6hbz-dyf7FwiPWJ4x45PK0ZJquMKV4n_z15-8SqAfiXQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 2048 + LdsBlockSizePerPadB: 2048 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 66560 + LdsInitCVgprs: false + LdsNumBytes: 66560 + LdsNumElementsAlignedA: 33280 + LdsNumElementsAlignedB: 33280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33280 + LdsOffsetB_Blk: 164352 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 66560 + LdsOffsetMetadata_Blk: 164352 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA2048_LBSPPB2048_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 0 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43008 + LdsInitCVgprs: false + LdsNumBytes: 43008 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 26112 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43008 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 48 + MacroTileA: 32 + MacroTileB: 48 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 6 + NumGlobalWriteVectorsPerThread: 3 + NumLoadsA: 4 + NumLoadsB: 6 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 6 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT32x48x128_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 512 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1xL-j8Nchxvh82x3gA1ZugFs4j6lELgVzV2fA-hmOsN8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 67584 + LdsInitCVgprs: false + LdsNumBytes: 67584 + LdsNumElementsAlignedA: 33792 + LdsNumElementsAlignedB: 33792 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 33792 + LdsOffsetB_Blk: 164864 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 67584 + LdsOffsetMetadata_Blk: 164864 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT128x128x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT67lNyUdTmw-cOQZDz4eeTMX4-DhVCJMsSaMIubs8z554= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 512 + LdsBlockSizePerPadB: 512 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 43520 + LdsInitCVgprs: false + LdsNumBytes: 43520 + LdsNumElementsAlignedA: 8704 + LdsNumElementsAlignedB: 34816 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 8704 + LdsOffsetB_Blk: 74240 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 43520 + LdsOffsetMetadata_Blk: 74240 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 256 + MacroTileA: 64 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x256x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA512_LBSPPB512_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_4_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB4_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 4 + ThreadTileA: 16 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6eM9UlY0usrKvssU3Ar5zZiiJpm6PCupFKVpS-ZFqsX0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 24576 + LdsInitCVgprs: false + LdsNumBytes: 24576 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 15360 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 24576 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 3] + MIWaveTileA: 2 + MIWaveTileB: 3 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 3 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x96x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB128_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_3_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 3 + ThreadTileA: 8 + ThreadTileB: 3 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ByCgfye7rWIPQ1YJohmIQ0lNCn8NrEKn7vXmxjyaZFM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4 + LDSTrInst: 1 + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 1024 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 33792 + LdsInitCVgprs: false + LdsNumBytes: 33792 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 16896 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 33792 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 4 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT16x16x256_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS1024_SPO0_SRVW0_SSO0_SVW1_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO1_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG16_4_4_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 1024 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6et-9-ykpgaai9G-L1VaR5RDg1xNbnI7j11JfZuQQm7o= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 1024 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 35328 + LdsInitCVgprs: false + LdsNumBytes: 35328 + LdsNumElementsAlignedA: 16896 + LdsNumElementsAlignedB: 18432 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 65536 + LdsOffsetB: 16896 + LdsOffsetB_Blk: 82432 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 35328 + LdsOffsetMetadata_Blk: 82432 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 64 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [4, 1] + MIWaveTileA: 4 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x64_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT4_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW4_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA4_VWB1_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 4 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 16 + ThreadTile1: 1 + ThreadTileA: 16 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 4 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT1ZglzLcUOj7D9-4HA3xQ6qm78zoBYVUYUrp0ST-4N2Tc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1 + LDSTrInst: 1 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 78336 + LdsInitCVgprs: false + LdsNumBytes: 78336 + LdsNumElementsAlignedA: 23040 + LdsNumElementsAlignedB: 55296 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 23040 + LdsOffsetB_Blk: 154112 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 78336 + LdsOffsetMetadata_Blk: 154112 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [1, 4] + MIWaveTile: [10, 6] + MIWaveTileA: 10 + MIWaveTileB: 6 + MIWaveTileMetadata: 0 + MacroTile0: 160 + MacroTile1: 384 + MacroTileA: 160 + MacroTileB: 384 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 10 + NumElementsPerThread: 240 + NumGlobalWriteVectorsPerThread: 120 + NumLoadsA: 5 + NumLoadsB: 12 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 5 + NumLoadsPerpendicularB: 12 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT160x384x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT10_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS10_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 64 + SubGroupA: 4 + SubGroupB: 64 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 40 + ThreadTile1: 6 + ThreadTileA: 40 + ThreadTileB: 6 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: false + ActivationFused: true + AdaptiveGemm: 1 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_S_MX_B_Bias_HA_S_SAV_UserArgs_MT6HKFsR1xpqIqB8iGFlETrmdcYqlXRxk0kl_LfZapGERo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XdlMathOp: true + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: 1 + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: -1 + GlobalSplitUAlgorithm: MultipleBufferSingleKernel + GlobalSplitUCoalesced: true + GlobalSplitUWorkGroupMappingRoundRobin: true + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [9, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUAMBSK_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1 + LDSTrInst: 0 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 18432 + LdsInitCVgprs: false + LdsNumBytes: 18432 + LdsNumElementsAlignedA: 9216 + LdsNumElementsAlignedB: 9216 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 9216 + LdsOffsetB_Blk: 41984 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 18432 + LdsOffsetMetadata_Blk: 41984 + LdsPadA: 8 + LdsPadB: 8 + LdsPadMetadata: 0 + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: false + MIArchVgpr: false + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 4 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [2, 2] + MIWaveTileA: 2 + MIWaveTileB: 2 + MIWaveTileMetadata: 0 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 163840 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 4 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: true + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_S_MX_B_UserArgs_MT64x64x32_MI16x16x1_SN_LDSB1_AFC0_AG1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI1_GRPM1_GRVWA4_GRVWB4_GSUn1_GSUAMBSK_GSUC1_GSUWGMRR1_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA256_LBSPPB256_LBSPPM0_LPA8_LPB8_LPM0_LRVW4_LWPMn1_MIAV0_MIWT2_2_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS8_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA1_SIA3_SS1_SU16_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK0_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA2_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM1_WGMXCC8_WGMXCCGn1 + SourceSwap: 1 + SpaceFillingAlgo: [] + StaggerU: 16 + StaggerUMapping: 0 + StaggerUStride: 128 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 2 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: false + SynchronizerSizeCheck: 1 + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseCustomMainLoopSchedule: false + UseDirect32XEmulation: true + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: true + UseInstOffsetForGRO: 0 + UsePLRPack: false + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 2 + VectorWidthB: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 64 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, -1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBufferSingleKernel + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: 0 + enableLDSTrB: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: true + tailLoopOptB: true +- [2, 3, 0, 1] +- - - [128, 1024, 1, 128] + - [0, 3.08] + - - [2011, 1024, 1, 512] + - [1, 58.83] + - - [128, 1024, 1, 512] + - [2, 9.27] + - - [128, 1024, 1, 4096] + - [3, 31.71] + - - [512, 1024, 1, 2011] + - [4, 46.44] + - - [128, 1024, 1, 64] + - [5, 1.72] + - - [128, 1024, 1, 144] + - [6, 2.98] + - - [128, 1024, 1, 72] + - [7, 1.7] + - - [128, 1024, 1, 256] + - [8, 5.58] + - - [7456, 1024, 1, 128] + - [9, 58.56] + - - [7968, 1024, 1, 256] + - [10, 77.41] + - - [128, 1024, 1, 96] + - [11, 2.37] + - - [128, 233, 1024, 32] + - [12, 25.13] + - - [120, 8192, 1, 256] + - [13, 24.98] + - - [120, 8192, 1, 512] + - [14, 36.23] + - - [128, 8192, 1, 64] + - [15, 10.73] + - - [128, 8192, 1, 128] + - [16, 18.51] + - - [128, 8192, 1, 512] + - [17, 41.14] + - - [128, 8192, 1, 5640] + - [18, 70.94] + - - [128, 8192, 1, 6912] + - [19, 76.65] + - - [128, 8192, 1, 10880] + - [20, 78.44] + - - [128, 98304, 1, 256] + - [21, 73.12] + - - [256, 8192, 1, 120] + - [22, 27.41] + - - [256, 8192, 1, 128] + - [23, 32.62] + - - [256, 8192, 1, 256] + - [24, 45.29] + - - [256, 8192, 1, 528] + - [25, 52.71] + - - [256, 8192, 1, 4608] + - [26, 92.0] + - - [528, 8192, 1, 256] + - [27, 57.71] + - - [528, 8192, 1, 512] + - [28, 74.32] + - - [1980, 8192, 1, 512] + - [29, 98.91] + - - [2440, 8192, 1, 128] + - [12, 59.84] + - - [3072, 8192, 1, 512] + - [30, 110.5] + - - [5120, 8192, 1, 128] + - [31, 69.7] + - - [32, 128, 8192, 4] + - [32, 2.59] + - - [36, 128, 8192, 16] + - [33, 8.31] + - - [128, 30, 8192, 4] + - [34, 2.44] + - - [128, 33, 8192, 16] + - [35, 8.65] + - - [128, 61, 8192, 40] + - [36, 18.88] + - - [256, 17711, 1, 887] + - [37, 78.04] + - - [960, 17711, 1, 128] + - [38, 55.24] + - - [2480, 17711, 1, 128] + - [39, 63.59] + - - [128, 17711, 1, 256] + - [40, 41.1] + - - [128, 17711, 1, 928] + - [41, 60.29] + - - [128, 17711, 1, 252] + - [42, 34.87] + - - [128, 17711, 1, 128] + - [43, 27.26] + - - [96, 17711, 1, 768] + - [44, 51.2] + - - [48, 124, 17711, 20] + - [45, 12.27] + - - [41, 128, 17711, 6] + - [46, 3.24] + - - [64, 819200, 1, 64] + - [47, 34.01] + - - [112, 655360, 1, 192] + - [48, 60.1] + - - [32, 262144, 1, 60] + - [49, 17.58] + - - [32, 262144, 1, 57] + - [32, 17.74] + - - [64, 131072, 1, 64] + - [50, 36.24] + - - [64, 102400, 1, 64] + - [51, 31.44] + - - [512, 4096, 1, 7680] + - [52, 105.67] + - - [304, 655360, 1, 644] + - [53, 94.46] + - - [256, 4096, 1, 256] + - [54, 27.9] + - - [192, 160, 4096, 48] + - [55, 26.38] + - - [64, 57, 4096, 32] + - [23, 15.56] + - - [32, 262144, 1, 84] + - [56, 18.4] + - - [200, 32, 4096, 64] + - [57, 18.97] + - - [64, 4096, 96, 160] + - [58, 45.87] + - - [64, 82, 4096, 32] + - [59, 18.45] + - - [64, 4096, 1, 2048] + - [60, 29.94] + - - [9216, 4096, 1, 512] + - [61, 114.52] + - - [512, 4096, 1, 512] + - [62, 58.22] + - - [64, 131072, 1, 128] + - [63, 40.14] + - - [82, 4096, 1, 2048] + - [64, 31.41] + - - [160, 4096, 1, 512] + - [65, 28.98] + - - [128, 4096, 1, 1600] + - [64, 40.1] + - - [320, 4096, 1, 72] + - [59, 11.57] + - - [512, 4096, 1, 2246] + - [66, 82.64] + - - [64, 527553, 1, 224] + - [67, 54.03] + - - [320, 4096, 1, 28] + - [68, 5.38] + - - [28, 4096, 1, 256] + - [69, 4.86] + - - [512, 4096, 1, 64] + - [70, 18.43] + - - [160, 642, 4096, 48] + - [71, 29.42] + - - [256, 4096, 1, 28] + - [22, 4.69] + - - [512, 4096, 1, 160] + - [72, 37.99] +- null +- null +- DeviceEfficiency +- Equality diff --git a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py index 5b9c3e6ff19..74a7f0f450a 100644 --- a/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py +++ b/projects/hipblaslt/tensilelite/Tensile/SolutionStructs/Solution.py @@ -2017,6 +2017,9 @@ def calSwizzlePackK(state, tc): reject(state, printRejectionReason, "GRVWB * DataTypeB.numBytes() > 16") disableGNLC = False # Set to true to disable GNLC if needed + # Temporary hack, if usesgprforgro is set to 1 to save vgprs, disable GNLC + if state["UseSgprForGRO"] == 1: + disableGNLC = True isMixedPrec = (state["ProblemType"]["DataTypeA"].numBytes() != state["ProblemType"]["DataTypeB"].numBytes()) if state["DirectToLds"] and state["LocalSplitU"] == 1 \ and not isMixedPrec and not state["ProblemType"]["Sparse"] \